In [1]:
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup # For HTML parsing
import requests
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
#from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import nltk
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline

In [2]:
import csv
import datetime
import time

In [3]:
import sqlalchemy
from sqlalchemy import create_engine

In [4]:
%load_ext watermark

This notebook is written by Yishin and Chi-Hung.


In [9]:
%watermark


2017-02-05T19:42:47+08:00

CPython 3.5.2
IPython 5.1.0

compiler   : GCC 4.2.1 Compatible Apple LLVM 7.3.0 (clang-703.0.31)
system     : Darwin
release    : 15.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

First of all, we know that there are 7 types of vacuums on Amazon


In [10]:
def getVacuumTypeUrl(vacuumType,pageNum=1):
    """Build (and print) the Amazon listing URL of one result page of a vacuum category.

    Parameters
    ----------
    vacuumType : int or str
        Amazon browse-node id of the category (e.g. ``510114``). One of the
        category names ``"central"``, ``"canister"``, ``"handheld"``,
        ``"robotic"``, ``"stick"``, ``"upright"`` or ``"wetdry"`` is also
        accepted and translated to its node id.
    pageNum : int, optional
        Result page number placed in both the ``ref=sr_pg_<n>`` path segment
        and the ``page`` query parameter. Defaults to 1.

    Returns
    -------
    str
        The listing URL; it is also printed as a side effect.
    """
    vcleaners={"central":11333709011,"canister":510108,"handheld":510114,"robotic":3743561,"stick":510112,"upright":510110,"wetdry":553022}

    # Resolve a readable category name to its node id; anything else
    # (ints, numeric strings) is used verbatim, exactly as before.
    if isinstance(vacuumType,str):
        node=vcleaners.get(vacuumType,vacuumType)
    else:
        node=vacuumType

    url_type_base="https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_"+str(pageNum)+"?ie=UTF8&node="
    url=url_type_base+str(node)+"&page="+str(pageNum)
    print(url)
    return url

In [11]:
# Category name -> Amazon browse-node id for the seven vacuum categories.
vcleaners=dict(
    central=11333709011,
    canister=510108,
    handheld=510114,
    robotic=3743561,
    stick=510112,
    upright=510110,
    wetdry=553022,
)

# Print every category with its node id, followed by its first-page URL
# (getVacuumTypeUrl prints the URL itself).
for key,node in vcleaners.items():
    print(key,node)
    getVacuumTypeUrl(node)


stick 510112
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=510112&page=1
central 11333709011
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=11333709011&page=1
wetdry 553022
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=553022&page=1
upright 510110
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=510110&page=1
canister 510108
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=510108&page=1
robotic 3743561
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=3743561&page=1
handheld 510114
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=510114&page=1

The following two functions are used to obtain the total number of result pages for each vacuum type.


In [12]:
def getFinalPageNum(url,maxretrytime=20,timeout=30):
    """Download a listing page and return the text of its last-page marker.

    Every attempt opens a fresh ``requests`` session with a desktop Firefox
    User-Agent, fetches ``url`` and looks for the first
    ``<span class="pagnDisabled">`` element. An attempt is retried when

    * ``requests`` reports a network error or the timeout expires,
    * the HTTP status is not 200,
    * the page text contains "Robot Check" (treated as being blocked), or
    * the pagination element is missing from the page.

    Parameters
    ----------
    url : str
        Listing page to download.
    maxretrytime : int, optional
        Maximum number of attempts before giving up.
    timeout : float, optional
        Seconds ``requests`` waits for the server on each attempt, so a
        stalled connection is retried instead of hanging forever.

    Returns
    -------
    str
        Text of the pagination element (``"84"`` in the run shown below).

    Raises
    ------
    Exception
        When ``maxretrytime`` attempts have all failed.
    """
    cnt=0

    while True:
        cnt+=1
        print("iteration from getFinalPageNum=",cnt)
        if cnt>maxretrytime:
            raise Exception("Error from getFinalPageNum(url)! Tried too many times but we are still blocked by Amazon.")

        try:
            with requests.Session() as session:
                session.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"}
                r=session.get(url,timeout=timeout)
                status,body=r.status_code,r.content
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; programming errors and
            # KeyboardInterrupt are no longer swallowed.
            print("Error from getFinalPageNum(url)! Probably due to connection time out")
            continue

        if status!=200:
            print("Connection failed. Reconnecting...")
            continue

        soup=BeautifulSoup(body,"lxml")
        if "Robot Check" in soup.text:
            print("we are blocked!")
            continue

        tagsFinalPageNum=soup.select("span[class='pagnDisabled']")
        if not tagsFinalPageNum:
            # Previously this surfaced as an IndexError hidden by the bare except.
            print("Pagination marker not found. Retrying...")
            continue

        return str(tagsFinalPageNum[0].text)

In [13]:
def InferFinalPageNum(vacuumType,pageNum=1,times=10):
    """Estimate the number of listing pages of a vacuum category.

    The listing URL is built once with ``getVacuumTypeUrl`` and then queried
    ``times`` times with ``getFinalPageNum``; the numerically smallest reported
    page count is returned (presumably because individual responses can
    disagree -- TODO confirm).

    Parameters
    ----------
    vacuumType : int or str
        Category identifier forwarded to ``getVacuumTypeUrl``.
    pageNum : int, optional
        Listing page that is downloaded to read the page count from.
    times : int, optional
        How many times the page count is sampled.

    Returns
    -------
    str
        The smallest sampled page count, in the textual form returned by
        ``getFinalPageNum``.

    Raises
    ------
    ValueError
        If ``times`` is 0, or a sampled value is not an integer.
    """
    url=getVacuumTypeUrl(vacuumType,pageNum)

    list_finalpageNum=[getFinalPageNum(url) for _ in range(times)]

    # The samples are strings: a plain min() compares them lexicographically
    # ("100" < "84"), so compare their integer values instead. Thousands
    # separators are ignored for the comparison only.
    FinalpageNum=min(list_finalpageNum,key=lambda text:int(str(text).replace(",","")))

    return FinalpageNum

In [14]:
# Sample the page count of node 510114 (the "handheld" category) from page 1.
FinalPageNum = InferFinalPageNum(vacuumType=510114, pageNum=1)
print('FinalPageNum=', FinalPageNum)


https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=510114&page=1
iteration from getFinalPageNum= 1
iteration from getFinalPageNum= 1
we are blocked!
iteration from getFinalPageNum= 2
we are blocked!
iteration from getFinalPageNum= 3
we are blocked!
iteration from getFinalPageNum= 4
we are blocked!
iteration from getFinalPageNum= 5
iteration from getFinalPageNum= 1
iteration from getFinalPageNum= 1
we are blocked!
iteration from getFinalPageNum= 2
we are blocked!
iteration from getFinalPageNum= 3
we are blocked!
iteration from getFinalPageNum= 4
we are blocked!
iteration from getFinalPageNum= 5
iteration from getFinalPageNum= 1
we are blocked!
iteration from getFinalPageNum= 2
we are blocked!
iteration from getFinalPageNum= 3
iteration from getFinalPageNum= 1
we are blocked!
iteration from getFinalPageNum= 2
we are blocked!
iteration from getFinalPageNum= 3
iteration from getFinalPageNum= 1
iteration from getFinalPageNum= 1
iteration from getFinalPageNum= 1
iteration from getFinalPageNum= 1
FinalPageNum= 84

So, right now, we are able to infer the total number of pages of a specific vacuum type.

The next step is to generate all URLs of the selected vacuum type:


In [15]:
def urlsGenerator(typenode,FinalPageNum):
    """Return the listing URLs of pages 1..FinalPageNum for one browse node.

    Parameters
    ----------
    typenode : int or str
        Amazon browse-node id; converted with ``str``.
    FinalPageNum : int or str
        Number of the last page; converted with ``int``.

    Returns
    -------
    list of str
        One URL per page, in ascending page order (empty when the last page
        number is below 1).
    """
    last_page=int(FinalPageNum)
    prefix="https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_"
    node=str(typenode)

    # The page number appears twice: in the "ref" path segment and in "page".
    return [
        prefix+str(page)+"?ie=UTF8&node="+node+"&page="+str(page)
        for page in range(1,last_page+1)
    ]

For the moment, let us choose the vacuum type "handheld":


In [16]:
# Build the listing URLs of node 510114 (the "handheld" category) and list them.
# The former bare `len(URLs)` line was dropped: mid-cell its value was discarded.
URLs=urlsGenerator(510114,FinalPageNum)
for url in URLs:
    print(url)


https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=510114&page=1
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_2?ie=UTF8&node=510114&page=2
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_3?ie=UTF8&node=510114&page=3
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_4?ie=UTF8&node=510114&page=4
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_5?ie=UTF8&node=510114&page=5
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_6?ie=UTF8&node=510114&page=6
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_7?ie=UTF8&node=510114&page=7
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_8?ie=UTF8&node=510114&page=8
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_9?ie=UTF8&node=510114&page=9
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_10?ie=UTF8&node=510114&page=10
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_11?ie=UTF8&node=510114&page=11
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_12?ie=UTF8&node=510114&page=12
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_13?ie=UTF8&node=510114&page=13
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_14?ie=UTF8&node=510114&page=14
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_15?ie=UTF8&node=510114&page=15
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_16?ie=UTF8&node=510114&page=16
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_17?ie=UTF8&node=510114&page=17
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_18?ie=UTF8&node=510114&page=18
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_19?ie=UTF8&node=510114&page=19
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_20?ie=UTF8&node=510114&page=20
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_21?ie=UTF8&node=510114&page=21
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_22?ie=UTF8&node=510114&page=22
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_23?ie=UTF8&node=510114&page=23
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_24?ie=UTF8&node=510114&page=24
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_25?ie=UTF8&node=510114&page=25
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_26?ie=UTF8&node=510114&page=26
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_27?ie=UTF8&node=510114&page=27
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_28?ie=UTF8&node=510114&page=28
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_29?ie=UTF8&node=510114&page=29
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_30?ie=UTF8&node=510114&page=30
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_31?ie=UTF8&node=510114&page=31
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_32?ie=UTF8&node=510114&page=32
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_33?ie=UTF8&node=510114&page=33
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_34?ie=UTF8&node=510114&page=34
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_35?ie=UTF8&node=510114&page=35
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_36?ie=UTF8&node=510114&page=36
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_37?ie=UTF8&node=510114&page=37
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_38?ie=UTF8&node=510114&page=38
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_39?ie=UTF8&node=510114&page=39
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_40?ie=UTF8&node=510114&page=40
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_41?ie=UTF8&node=510114&page=41
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_42?ie=UTF8&node=510114&page=42
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_43?ie=UTF8&node=510114&page=43
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_44?ie=UTF8&node=510114&page=44
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_45?ie=UTF8&node=510114&page=45
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_46?ie=UTF8&node=510114&page=46
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_47?ie=UTF8&node=510114&page=47
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_48?ie=UTF8&node=510114&page=48
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_49?ie=UTF8&node=510114&page=49
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_50?ie=UTF8&node=510114&page=50
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_51?ie=UTF8&node=510114&page=51
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_52?ie=UTF8&node=510114&page=52
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_53?ie=UTF8&node=510114&page=53
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_54?ie=UTF8&node=510114&page=54
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_55?ie=UTF8&node=510114&page=55
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_56?ie=UTF8&node=510114&page=56
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_57?ie=UTF8&node=510114&page=57
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_58?ie=UTF8&node=510114&page=58
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_59?ie=UTF8&node=510114&page=59
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_60?ie=UTF8&node=510114&page=60
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_61?ie=UTF8&node=510114&page=61
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_62?ie=UTF8&node=510114&page=62
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_63?ie=UTF8&node=510114&page=63
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_64?ie=UTF8&node=510114&page=64
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_65?ie=UTF8&node=510114&page=65
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_66?ie=UTF8&node=510114&page=66
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_67?ie=UTF8&node=510114&page=67
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_68?ie=UTF8&node=510114&page=68
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_69?ie=UTF8&node=510114&page=69
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_70?ie=UTF8&node=510114&page=70
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_71?ie=UTF8&node=510114&page=71
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_72?ie=UTF8&node=510114&page=72
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_73?ie=UTF8&node=510114&page=73
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_74?ie=UTF8&node=510114&page=74
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_75?ie=UTF8&node=510114&page=75
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_76?ie=UTF8&node=510114&page=76
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_77?ie=UTF8&node=510114&page=77
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_78?ie=UTF8&node=510114&page=78
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_79?ie=UTF8&node=510114&page=79
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_80?ie=UTF8&node=510114&page=80
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_81?ie=UTF8&node=510114&page=81
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_82?ie=UTF8&node=510114&page=82
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_83?ie=UTF8&node=510114&page=83
https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_84?ie=UTF8&node=510114&page=84

Next, we'd like to obtain all the "soups" of the vacuum type "handheld" and store them into a list


In [ ]:
def soupGenerator(URLs,maxretrytime=20,timeout=30):
    """Download every URL and return the parsed pages as BeautifulSoup objects.

    For each URL a fresh ``requests`` session with a desktop Firefox
    User-Agent is opened per attempt. An attempt is retried when ``requests``
    reports a network error or the timeout expires, when the HTTP status is
    not 200, or when the page text contains "Robot Check" (treated as being
    blocked). Progress is printed for every URL and attempt.

    Parameters
    ----------
    URLs : iterable of str
        Pages to download, processed in order.
    maxretrytime : int, optional
        Maximum number of attempts per URL.
    timeout : float, optional
        Seconds ``requests`` waits for the server on each attempt, so a
        stalled connection is retried instead of hanging forever.

    Returns
    -------
    list
        One ``BeautifulSoup`` object per URL, in input order.

    Raises
    ------
    Exception
        When a URL could not be fetched within ``maxretrytime`` attempts.
    """
    soups=[]
    for urlindex,URL in enumerate(URLs,start=1):
        print("urlindex=",urlindex)
        cnt=0
        while True:
            cnt+=1
            print("iteration=",cnt)
            if cnt>maxretrytime:
                raise Exception("Error from soupGenerator(url,maxretrytime=20)! Tried too many times but we are still blocked by Amazon.")

            try:
                with requests.Session() as session:
                    session.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"}
                    r=session.get(URL,timeout=timeout)
                    status,body=r.status_code,r.content
            except requests.exceptions.RequestException:
                # Only network-level failures are retried; programming errors
                # and KeyboardInterrupt are no longer swallowed.
                print("Error from soupGenerator(URLs,maxretrytime=20)! Probably due to connection time out")
                continue

            if status!=200:
                print ("Connection failed. Reconnecting...")
                continue

            soup=BeautifulSoup(body,"lxml")
            if "Robot Check" in soup.text:
                print("we are blocked!")
                continue

            print("we are not blocked!")
            soups.append(soup)
            break

    return soups

In [19]:
# Download and parse every listing page in URLs, allowing up to 20 attempts per page.
soups=soupGenerator(URLs,maxretrytime=20)


urlindex= 1
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 2
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 3
iteration= 1
we are not blocked!
urlindex= 4
iteration= 1
we are not blocked!
urlindex= 5
iteration= 1
we are not blocked!
urlindex= 6
iteration= 1
we are not blocked!
urlindex= 7
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 8
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are not blocked!
urlindex= 9
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are blocked!
iteration= 10
we are blocked!
iteration= 11
we are blocked!
iteration= 12
we are blocked!
iteration= 13
we are blocked!
iteration= 14
we are blocked!
iteration= 15
we are blocked!
iteration= 16
we are blocked!
iteration= 17
we are not blocked!
urlindex= 10
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 11
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 12
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are blocked!
iteration= 10
we are not blocked!
urlindex= 13
iteration= 1
we are not blocked!
urlindex= 14
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 15
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are not blocked!
urlindex= 16
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are not blocked!
urlindex= 17
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are not blocked!
urlindex= 18
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are not blocked!
urlindex= 19
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are not blocked!
urlindex= 20
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 21
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 22
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 23
iteration= 1
we are not blocked!
urlindex= 24
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 25
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 26
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are not blocked!
urlindex= 27
iteration= 1
we are not blocked!
urlindex= 28
iteration= 1
we are not blocked!
urlindex= 29
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are blocked!
iteration= 10
we are blocked!
iteration= 11
we are blocked!
iteration= 12
we are blocked!
iteration= 13
we are blocked!
iteration= 14
we are blocked!
iteration= 15
we are blocked!
iteration= 16
we are blocked!
iteration= 17
we are blocked!
iteration= 18
we are not blocked!
urlindex= 30
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are not blocked!
urlindex= 31
iteration= 1
we are not blocked!
urlindex= 32
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are not blocked!
urlindex= 33
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are not blocked!
urlindex= 34
iteration= 1
we are not blocked!
urlindex= 35
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are not blocked!
urlindex= 36
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 37
iteration= 1
we are not blocked!
urlindex= 38
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are not blocked!
urlindex= 39
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 40
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 41
iteration= 1
we are not blocked!
urlindex= 42
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 43
iteration= 1
we are not blocked!
urlindex= 44
iteration= 1
we are not blocked!
urlindex= 45
iteration= 1
we are not blocked!
urlindex= 46
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 47
iteration= 1
we are not blocked!
urlindex= 48
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 49
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 50
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 51
iteration= 1
we are not blocked!
urlindex= 52
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 53
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are not blocked!
urlindex= 54
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 55
iteration= 1
we are not blocked!
urlindex= 56
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are not blocked!
urlindex= 57
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 58
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 59
iteration= 1
we are not blocked!
urlindex= 60
iteration= 1
we are not blocked!
urlindex= 61
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 62
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 63
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are blocked!
iteration= 10
we are not blocked!
urlindex= 64
iteration= 1
we are not blocked!
urlindex= 65
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 66
iteration= 1
we are not blocked!
urlindex= 67
iteration= 1
we are not blocked!
urlindex= 68
iteration= 1
we are not blocked!
urlindex= 69
iteration= 1
we are not blocked!
urlindex= 70
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 71
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 72
iteration= 1
we are not blocked!
urlindex= 73
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 74
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are not blocked!
urlindex= 75
iteration= 1
we are blocked!
iteration= 2
we are not blocked!
urlindex= 76
iteration= 1
we are not blocked!
urlindex= 77
iteration= 1
we are not blocked!
urlindex= 78
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are blocked!
iteration= 10
we are blocked!
iteration= 11
we are blocked!
iteration= 12
we are blocked!
iteration= 13
we are not blocked!
urlindex= 79
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are blocked!
iteration= 5
we are blocked!
iteration= 6
we are blocked!
iteration= 7
we are blocked!
iteration= 8
we are blocked!
iteration= 9
we are blocked!
iteration= 10
we are blocked!
iteration= 11
we are blocked!
iteration= 12
we are blocked!
iteration= 13
we are blocked!
iteration= 14
we are blocked!
iteration= 15
we are not blocked!
urlindex= 80
iteration= 1
we are not blocked!
urlindex= 81
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!
urlindex= 82
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are blocked!
iteration= 4
we are not blocked!
urlindex= 83
iteration= 1
we are not blocked!
urlindex= 84
iteration= 1
we are blocked!
iteration= 2
we are blocked!
iteration= 3
we are not blocked!

How many soups have we created?


In [20]:
# One soup is appended per successfully fetched URL, so this should equal len(URLs).
print(len(soups))


84

Let us pause for a while. We would like to review the usage of CSS selectors


In [181]:
# Toy HTML for the selector exercises: in span.abc the <a> is nested inside a
# <div>, whereas in span.def the <a> is a direct child of the span.
example='''
<span class="abc">
  <div>
    <a href="http://123xyz.com"></a>
    hello_div01
  </div>
</span>

<span class="def">
  <a href="http://www.go.123xyz"></a>
  <div>hello_div02</div>
</span>
'''

In [182]:
# Parse the toy HTML with the lxml parser (it wraps the fragment in <html><body>).
mysoup=BeautifulSoup(example,"lxml")

In [183]:
# Show the parsed tree, one tag per line, to make the nesting visible.
print(mysoup.prettify())


<html>
 <body>
  <span class="abc">
   <div>
    <a href="http://123xyz.com">
    </a>
    hello_div01
   </div>
  </span>
  <span class="def">
   <a href="http://www.go.123xyz">
   </a>
   <div>
    hello_div02
   </div>
  </span>
 </body>
</html>

Exercise: look for a specific tag which is a descendant of some other tag


In [187]:
# Descendant combinator (a space): every <a> at any depth inside the element with class "abc".
mysoup.select(".abc a")


Out[187]:
[<a href="http://123xyz.com"></a>]

In [288]:
# Child combinator (>): only direct children qualify. The <a> sits inside a <div>,
# not directly under .abc, so nothing matches.
mysoup.select(".abc > a")


Out[288]:
[]

The symbol `>` indicates that we are looking for `a` tags that are direct children of the tag whose class is `abc`.

If we use ".abc a" instead, we find all `a` tags that are descendants (at any depth) of the tag whose class is `abc`.


In [289]:
# The <div> is a direct child of .abc, so the child combinator matches it.
mysoup.select(".abc > div")


Out[289]:
[<div>
 <a href="http://123xyz.com"></a>
     hello_div01
   </div>]

Exercise: find the tags whose `href` attribute value starts with "http"


In [290]:
# ^= is the attribute-prefix operator: both hrefs begin with "http".
mysoup.select("a[href^='http']")


Out[290]:
[<a href="http://123xyz.com"></a>, <a href="http://www.go.123xyz"></a>]

Exercise: find the tags whose `href` attribute value ends with "http"


In [291]:
# $= is the attribute-suffix operator: neither href ends with "http", hence the empty list.
mysoup.select("a[href$='http']")


Out[291]:
[]

Exercise: extract the value of a specific attr of a specific tag


In [185]:
# Take the first match and read its href attribute with dictionary-style indexing.
mysoup.select(".abc a")[0]["href"]


Out[185]:
'http://123xyz.com'


In [581]:
# First product entry (<li id="result_...">) of the 71st downloaded page.
sp=soups[70].select('li[id^="result_"]')[0]

print(sp)

# Print the first <span> found under each direct child of the entry.
# The loop previously printed sp.span on every pass, leaving the loop variable
# unused and the except branch unreachable.
for s in sp:
    try:
        print(s.span)
    except AttributeError:
        # Presumably reached for text nodes between tags, which do not
        # support tag-style attribute lookup -- TODO confirm.
        print("error")


<li class="s-result-item celwidget " data-asin="B00QBJ0NP6" id="result_1680"><div class="s-item-container"><div class="a-row sx-badge-region"><div class="a-row a-spacing-top-micro a-spacing-micro"><div class="a-row a-spacing-large"></div></div></div><div class="a-row a-spacing-base"><div aria-hidden="true" class="a-column a-span12 a-text-left"><div class="a-section a-spacing-none a-inline-block s-position-relative"><a class="a-link-normal a-text-normal" href="https://www.amazon.com/Mini-Vacuum-Cleaner-Household-Vacuuming/dp/B00QBJ0NP6/ref=lp_510114_1_1681/159-6918950-0010931?s=vacuums&amp;ie=UTF8&amp;qid=1485362182&amp;sr=1-1681"><img alt="Product Details" class="s-access-image cfMarker" data-search-image-load="" height="160" src="https://images-na.ssl-images-amazon.com/images/I/31z0+kMLpUL._AC_US160_.jpg" srcset="https://images-na.ssl-images-amazon.com/images/I/31z0+kMLpUL._AC_US160_.jpg 1x, https://images-na.ssl-images-amazon.com/images/I/31z0+kMLpUL._AC_US240_QL65_.jpg 1.5x, https://images-na.ssl-images-amazon.com/images/I/31z0+kMLpUL._AC_US320_QL65_.jpg 2x, https://images-na.ssl-images-amazon.com/images/I/31z0+kMLpUL._AC_US400_QL65_.jpg 2.5x, https://images-na.ssl-images-amazon.com/images/I/31z0+kMLpUL._AC_US480_QL65_.jpg 3x" width="160"/></a><div class="a-section a-spacing-none a-text-center"></div></div></div></div><div class="a-row a-spacing-mini"><div class="a-row a-spacing-none"><a class="a-link-normal s-access-detail-page a-text-normal" href="https://www.amazon.com/Mini-Vacuum-Cleaner-Household-Vacuuming/dp/B00QBJ0NP6/ref=lp_510114_1_1681/159-6918950-0010931?s=vacuums&amp;ie=UTF8&amp;qid=1485362182&amp;sr=1-1681" title="Mini Vacuum Cleaner - Set of 12, [Household Supplies, Vacuuming]"><h2 class="a-size-base a-color-null s-inline s-access-title color-variation-title-replacement a-text-normal" data-attribute="Mini Vacuum Cleaner - Set of 12, [Household Supplies, Vacuuming]" data-max-rows="0" data-truncate-by-character="false">Mini Vacuum Cleaner - Set of 
12, [Household Supplies, Vacuuming]</h2></a></div><div class="a-row a-spacing-none"><span class="a-size-small a-color-secondary">by </span><span class="a-size-small a-color-secondary">StarSun Depot</span></div></div><div class="a-row a-spacing-mini"><div class="a-row a-spacing-none"><a class="a-link-normal a-text-normal" href="https://www.amazon.com/Mini-Vacuum-Cleaner-Household-Vacuuming/dp/B00QBJ0NP6/ref=lp_510114_1_1681/159-6918950-0010931?s=vacuums&amp;ie=UTF8&amp;qid=1485362182&amp;sr=1-1681"><span class="a-color-base"><span class="sx-price sx-price-large">
<sup class="sx-price-currency">$</sup>
<span class="sx-price-whole">37</span>
<sup class="sx-price-fractional">68</sup>
</span>
</span></a><span class="a-letter-space"></span><span class="a-size-base-plus a-color-secondary">+ $17.74 shipping</span></div><div class="a-row a-spacing-none"><div class="a-row a-spacing-none"><span class="a-size-small a-color-price">Only 1 left in stock - order soon.</span></div></div></div></div></li>
<span class="a-size-small a-color-secondary">by </span>

Let's go back.

First of all, let us look for the Product URL of the first item of the first page

print the link of the first page:


In [21]:
# Rebuild the listing URLs of node 510114 and show the URL of the first page.
# The discarded `len(URLs)` expression and the commented-out loop were removed.
URLs=urlsGenerator(510114,FinalPageNum)
print(URLs[0])


https://www.amazon.com/home-garden-kitchen-furniture-bedding/b/ref=sr_pg_1?ie=UTF8&node=510114&page=1

We found that the Product URL of the first item can be extracted via:


In [22]:
# First search result (an <li> whose id starts with "result_") of the first parsed page,
# then the anchor that wraps its product title; the anchor's href is the product URL.
soups[0].select('li[id^="result_"]')[0].select("a[class='a-link-normal s-access-detail-page a-text-normal']")[0]


Out[22]:
<a class="a-link-normal s-access-detail-page a-text-normal" href="/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_browse_garden_sr_pg1_1?ie=UTF8&amp;adId=A02267761V6H01YO6L5KN&amp;url=https%3A%2F%2Fwww.amazon.com%2FDyson-V8-Absolute-Cord-Free-Vacuum%2Fdp%2FB01IENFJ14%2Fref%3Dlp_510114_1_1%2F163-0498112-6826524%3Fs%3Dvacuums%26ie%3DUTF8%26qid%3D1486295130%26sr%3D1-1-spons%26psc%3D1&amp;qualifier=1486295130&amp;id=8846538609981157&amp;widgetName=sp_atf_browse" title="Dyson V8 Absolute Cord-Free Vacuum"><h2 class="a-size-base a-color-null s-inline s-access-title color-variation-title-replacement a-text-normal" data-attribute="Dyson V8 Absolute Cord-Free Vacuum" data-max-rows="0" data-truncate-by-character="false">Dyson V8 Absolute Cord-Free Vacuum</h2></a>

where we have used the fact that each item has one unique id.


In [265]:
# Within the first search result, pick the anchor whose href ends with "customerReviews".
# Its text is the review count and its href is the product link plus a "#customerReviews" fragment.
csrev_tag=soups[0].select('li[id^="result_"]')[0].select("a[href$='customerReviews']")[0]
print(csrev_tag)


<a class="a-size-small a-link-normal a-text-normal" href="https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1#customerReviews">10,106</a>

This means we are able to obtain the total number of customer reviews (10,106) and also the link of the selected item:

https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&ie=UTF8&qid=1485361951&sr=1-1

The above link will then be replaced by the following one:

which shows 50 customer reviews per page (instead of 10 reviews per page by default).

Another Goal: We'd like to obtain the price of the selected item

Now, let's look for more information, e.g. the price of the selected product. We know that the tag we have found is stored at the end part of a big tag which contains all the info of a specific item. Now, to retrieve more info of that item, we'll move ourselves from the end part to the front gradually.


In [266]:
# Step up to the row that encloses the review-count link; it also holds the star-rating widget.
csrev_tag.parent


Out[266]:
<div class="a-row a-spacing-none"><span name="B006LXOJC0">
<span class="a-declarative" data-a-popover='{"max-width":"700","closeButton":"false","position":"triggerBottom","url":"/review/widgets/average-customer-review/popover/ref=acr_search__popover?ie=UTF8&amp;asin=B006LXOJC0&amp;contextId=search&amp;ref=acr_search__popover"}' data-action="a-popover"><a class="a-popover-trigger a-declarative" href="javascript:void(0)"><i class="a-icon a-icon-star a-star-4-5"><span class="a-icon-alt">4.3 out of 5 stars</span></i><i class="a-icon a-icon-popover"></i></a></span></span>
<a class="a-size-small a-link-normal a-text-normal" href="https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1#customerReviews">10,106</a></div>

In [317]:
# Walk two siblings back from the review row (skipping the row in between)
# to reach the block that contains the price markup.
csrev_tag.parent.previous_sibling.previous_sibling


Out[317]:
<div class="a-row a-spacing-mini"><div class="a-row a-spacing-none"><a class="a-link-normal a-text-normal" href="https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1"><span class="a-color-base"><span class="sx-price sx-price-large">
<sup class="sx-price-currency">$</sup>
<span class="sx-price-whole">59</span>
<sup class="sx-price-fractional">99</sup>
</span>
</span></a><span class="a-letter-space"></span><span aria-label="Suggested Retail Price: $89.99" class="a-size-base-plus a-color-secondary a-text-strike">$89.99</span><span class="a-letter-space"></span><i aria-label="Prime" class="a-icon a-icon-prime a-icon-small s-align-text-bottom"><span class="a-icon-alt">Prime</span></i></div><div class="a-row a-spacing-none"><div class="a-row a-spacing-none"><span class="a-size-small a-color-secondary">Get it by <span class="a-color-success a-text-bold">Tomorrow, Jan 26</span></span></div></div><div class="a-row a-spacing-none"><div class="a-row a-spacing-mini"></div><span class="a-size-small a-color-secondary">More Buying Choices</span></div><div class="a-row a-spacing-none"><a class="a-size-small a-link-normal a-text-normal" href="https://www.amazon.com/gp/offer-listing/B006LXOJC0/ref=lp_510114_1_1_olp/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1&amp;condition=new"><span class="a-size-base a-color-base">$59.99</span><span class="a-letter-space"></span>new<span class="a-letter-space"></span><span class="a-color-secondary">(65 offers)</span><span class="a-letter-space"></span><span class="a-color-secondary a-text-strike"></span></a></div><div class="a-row a-spacing-none"><a class="a-size-small a-link-normal a-text-normal" href="https://www.amazon.com/gp/offer-listing/B006LXOJC0/ref=lp_510114_1_1_olp/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1&amp;condition=used"><span class="a-size-base a-color-base">$38.15</span><span class="a-letter-space"></span>used<span class="a-letter-space"></span><span class="a-color-secondary">(20 offers)</span><span class="a-letter-space"></span><span class="a-color-secondary a-text-strike"></span></a></div></div>

In [326]:
# The price block sits two siblings before the row that holds the review link.
pricetag=csrev_tag.parent.previous_sibling.previous_sibling
# Dollars and cents are rendered in separate tags, so read them separately (as text).
price=pricetag.select(".sx-price-whole")[0].text
fraction_price=pricetag.select(".sx-price-fractional")[0].text
print(price,fraction_price)
# Recombine into one number: the fractional part counts hundredths of a dollar.
print(int(price)+0.01*int(fraction_price))


59 99
59.99

so, we are able to obtain the price of the selected item.

Yet Another Goal: Let's see if we can obtain the brand of the selected item


In [333]:
# The parent of the price block is the item container (class "s-item-container"),
# which also holds the title and the "by <brand>" line.
pricetag.parent


Out[333]:
<div class="s-item-container"><div class="a-row sx-badge-region"><div class="a-row a-spacing-top-micro a-spacing-micro"><a class="a-size-small a-link-normal a-text-normal" href="/gp/bestsellers/home-garden/510114/ref=sr_bs_1_510114_1"><span class="aok-float-left sx-badge-rectangle sx-bestseller-color"><span class="sx-badge-text s-color-white">Best Seller</span></span><span class="aok-float-left sx-badge-triangle sx-bestseller-color"></span><span class="s-padding-left-mini s-hidden aok-relative sx-top-left-badge-tooltip sx-bestseller-node sx-bestseller-color">in Handheld Vacuums</span></a></div></div><div class="a-row a-spacing-base"><div aria-hidden="true" class="a-column a-span12 a-text-left"><div class="a-section a-spacing-none a-inline-block s-position-relative"><a class="a-link-normal a-text-normal" href="https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1"><img alt="Product Details" class="s-access-image cfMarker" data-search-image-load="" height="160" src="https://images-na.ssl-images-amazon.com/images/I/411HN6WBUxL._AC_US160_.jpg" srcset="https://images-na.ssl-images-amazon.com/images/I/411HN6WBUxL._AC_US160_.jpg 1x, https://images-na.ssl-images-amazon.com/images/I/411HN6WBUxL._AC_US240_QL65_.jpg 1.5x, https://images-na.ssl-images-amazon.com/images/I/411HN6WBUxL._AC_US320_QL65_.jpg 2x, https://images-na.ssl-images-amazon.com/images/I/411HN6WBUxL._AC_US400_QL65_.jpg 2.5x, https://images-na.ssl-images-amazon.com/images/I/411HN6WBUxL._AC_US480_QL65_.jpg 3x" width="160"/></a><div class="a-section a-spacing-none a-text-center"></div></div></div></div><div class="a-row a-spacing-mini"><div class="a-row a-spacing-none"><a class="a-link-normal s-access-detail-page a-text-normal" href="https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1" 
title="BLACK + DECKER CHV1410L 16V Cordless Lithium Hand Vac"><h2 class="a-size-base a-color-null s-inline s-access-title color-variation-title-replacement a-text-normal" data-attribute="BLACK + DECKER CHV1410L 16V Cordless Lithium Hand Vac" data-max-rows="0" data-truncate-by-character="false">BLACK + DECKER CHV1410L 16V Cordless Lithium Hand Vac</h2></a></div><div class="a-row a-spacing-none"><span class="a-size-small a-color-secondary">by </span><span class="a-size-small a-color-secondary">BLACK+DECKER</span></div></div><div class="a-row a-spacing-mini"><div class="a-row a-spacing-none"><a class="a-link-normal a-text-normal" href="https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1"><span class="a-color-base"><span class="sx-price sx-price-large">
<sup class="sx-price-currency">$</sup>
<span class="sx-price-whole">59</span>
<sup class="sx-price-fractional">99</sup>
</span>
</span></a><span class="a-letter-space"></span><span aria-label="Suggested Retail Price: $89.99" class="a-size-base-plus a-color-secondary a-text-strike">$89.99</span><span class="a-letter-space"></span><i aria-label="Prime" class="a-icon a-icon-prime a-icon-small s-align-text-bottom"><span class="a-icon-alt">Prime</span></i></div><div class="a-row a-spacing-none"><div class="a-row a-spacing-none"><span class="a-size-small a-color-secondary">Get it by <span class="a-color-success a-text-bold">Tomorrow, Jan 26</span></span></div></div><div class="a-row a-spacing-none"><div class="a-row a-spacing-mini"></div><span class="a-size-small a-color-secondary">More Buying Choices</span></div><div class="a-row a-spacing-none"><a class="a-size-small a-link-normal a-text-normal" href="https://www.amazon.com/gp/offer-listing/B006LXOJC0/ref=lp_510114_1_1_olp/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1&amp;condition=new"><span class="a-size-base a-color-base">$59.99</span><span class="a-letter-space"></span>new<span class="a-letter-space"></span><span class="a-color-secondary">(65 offers)</span><span class="a-letter-space"></span><span class="a-color-secondary a-text-strike"></span></a></div><div class="a-row a-spacing-none"><a class="a-size-small a-link-normal a-text-normal" href="https://www.amazon.com/gp/offer-listing/B006LXOJC0/ref=lp_510114_1_1_olp/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1&amp;condition=used"><span class="a-size-base a-color-base">$38.15</span><span class="a-letter-space"></span>used<span class="a-letter-space"></span><span class="a-color-secondary">(20 offers)</span><span class="a-letter-space"></span><span class="a-color-secondary a-text-strike"></span></a></div></div><div class="a-row a-spacing-top-mini a-spacing-mini"><div class="a-row a-spacing-none"><span class="a-size-small a-color-secondary">FREE Shipping on eligible orders</span></div></div><div class="a-row a-spacing-none"><span 
name="B006LXOJC0">
<span class="a-declarative" data-a-popover='{"max-width":"700","closeButton":"false","position":"triggerBottom","url":"/review/widgets/average-customer-review/popover/ref=acr_search__popover?ie=UTF8&amp;asin=B006LXOJC0&amp;contextId=search&amp;ref=acr_search__popover"}' data-action="a-popover"><a class="a-popover-trigger a-declarative" href="javascript:void(0)"><i class="a-icon a-icon-star a-star-4-5"><span class="a-icon-alt">4.3 out of 5 stars</span></i><i class="a-icon a-icon-popover"></i></a></span></span>
<a class="a-size-small a-link-normal a-text-normal" href="https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/dp/B006LXOJC0/ref=lp_510114_1_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1#customerReviews">10,106</a></div><div class="a-row a-spacing-top-mini a-spacing-mini"><a class="a-link-emphasis a-text-normal" href="https://www.amazon.com/dp/B01DAI5CF6/ref=lp_510114_ob_1/157-7476471-7904367?s=vacuums&amp;ie=UTF8&amp;qid=1485361951&amp;sr=1-1">See newer version</a></div></div>

In [335]:
# Brand lookup: for this item the third ".a-size-small" element of the item container is the brand.
# Note: a sibling shares its parent, so `pricetag.previous_sibling.parent` is the same
# container as `pricetag.parent`.
pricetag.previous_sibling.parent.select(".a-size-small")[2].text


Out[335]:
'BLACK+DECKER'

Another goal: the average star rating of the selected item


In [55]:
# Print the average star rating of the first 30 results on the listing page at index 2.
# The result list does not change inside the loop, so it is selected once.
results=soups[2].select('li[id^="result_"]')
for j in range(30):
    try:
        # The rating text lives in the element right before the popover icon.
        selected=results[j].select_one("i[class='a-icon a-icon-popover']").previous_sibling
        print(len(selected),selected.string.split(" ")[0])
    except (IndexError,AttributeError,TypeError):
        # IndexError: fewer than 30 results on the page.
        # AttributeError/TypeError: no rating markup found for this result.
        # Any other exception is a real bug and is allowed to surface.
        print("index= ",j,", 0 stars (no reviews yet)")


1 4.5
1 4.3
1 4.1
1 4.4
1 3.9
1 4.3
1 4.4
1 4.1
1 3.2
1 4.2
1 4.3
1 4
1 4.6
1 3.6
1 2.9
1 4.2
1 4.2
17 4
1 4.7
1 3.8
1 4.2
1 2.6
1 4.2
1 3.9
1 4.5
1 4.6
1 4.4
index=  27 , 0 stars (no reviews yet)
index=  28 , 0 stars (no reviews yet)
index=  29 , 0 stars (no reviews yet)

In [614]:
# href of the third anchor inside the first result of the listing page at index 10.
print(soups[10].select('li[id^="result_"]')[0].find_all("a")[2]["href"]) # 5 stars (although only 2 reviews)


https://www.amazon.com/gp/offer-listing/B01HQK8IZA/ref=lp_510114_1_241_olp/157-4244468-6130167?s=vacuums&ie=UTF8&qid=1485361987&sr=1-241&condition=new

In [615]:
# href of the third anchor inside the first result of the listing page at index 12.
print(soups[12].select('li[id^="result_"]')[0].find_all("a")[2]["href"]) # 0 stars (no customer reviews yet)


https://www.amazon.com/BLACK-DECKER-DustBuster-Cordless-Vacuum/dp/B00NCT8F0S/ref=lp_510114_1_289/157-4916605-7228009?s=vacuums&ie=UTF8&qid=1485361993&sr=1-289

Now we are ready to merge all the ingredients learned from above code blocks into one function


In [658]:
def items_info_extractor(soups):
    """Extract per-product data from parsed Amazon listing pages.

    Every search result (an ``li`` whose id starts with ``result_``) is
    inspected. A result is kept only if it has a customer-review link, is not
    a sponsored ad, and shows a regular whole/fractional price.

    Parameters
    ----------
    soups : iterable
        Parsed listing pages; each must support ``select`` with CSS selectors
        (e.g. ``BeautifulSoup`` objects).

    Returns
    -------
    tuple
        Seven index-aligned lists, one entry per kept product:
        ``(item_brands, item_ids, item_names, item_prices,
        item_num_of_reviews, item_links, item_avestars)``.
        ``item_links`` holds the rewritten customer-review URL, ``item_prices``
        are floats rounded to cents, ``item_num_of_reviews`` are ints and
        ``item_avestars`` are floats (``0.0`` when no rating is shown).
    """
    item_links=[]
    item_num_of_reviews=[]
    item_prices=[]
    item_names=[]
    item_ids=[]
    item_brands=[]
    item_avestars=[]

    base_url="https://www.amazon.com/"

    for soup in soups:
        for item in soup.select('li[id^="result_"]'):

            # Ignore items with 0 customer reviews; they are irrelevant to us.
            link_item=item.select("a[href$='customerReviews']")
            if not link_item:
                continue
            review_tag=link_item[0]
            link=review_tag["href"]

            # Remove items which are ads (sponsored results).
            if link.endswith("spons#customerReviews"):
                continue

            # The price block sits two siblings before the row holding the review link.
            # Skip the item instead of crashing when that layout is not present.
            between_row=review_tag.parent.previous_sibling
            price_tag=getattr(between_row,"previous_sibling",None)
            if not hasattr(price_tag,"select"):
                continue

            # Ignore items which don't have normal price tags
            # (presumably items not sold by Amazon directly).
            price_main_tag=price_tag.select(".sx-price-whole")
            price_fraction_tag=price_tag.select(".sx-price-fractional")
            if not (price_main_tag and price_fraction_tag):
                continue

            # Extract the item's name and ID from the link
            # (https://www.amazon.com/<name>/dp/<id>/...).
            parts=link.split("/")
            if len(parts)<6:
                continue
            item_name=parts[3]
            item_id=parts[5]

            # Replace the product link by the link that leads to the customer reviews.
            link=base_url+item_name+"/product-reviews/"+item_id+"/ref=cm_cr_getr_d_paging_btm_" \
                         +str(1)+"?ie=UTF8&pageNumber="+str(1)+"&reviewerType=all_reviews&pageSize=1000"

            # Price: the whole part may carry a thousands separator ("1,299");
            # rounding to cents avoids binary floating-point noise.
            price_main=price_main_tag[0].text.replace(",","")
            price_fraction=price_fraction_tag[0].text
            item_price=round(int(price_main)+int(price_fraction)/100.0,2)

            # Brand: the second ".a-size-small" element, unless that is the
            # literal "by " label, in which case the brand is the next one.
            small_tags=price_tag.parent.select(".a-size-small")
            item_brand=small_tags[1].text
            if item_brand=="by ":
                item_brand=small_tags[2].text

            # Number of reviews, e.g. "10,106" -> 10106.
            item_num_of_review=int(review_tag.text.replace(",",""))

            # Average stars: first token of e.g. "4.3 out of 5 stars".
            # Stays 0.0 when no rating markup or text is present.
            item_avestar=0.0
            starSelect=item.select_one("span[class='a-declarative']")
            if (starSelect is not None) and (starSelect.span is not None) and starSelect.span.string:
                try:
                    item_avestar=float(starSelect.span.string.split(" ")[0])
                except ValueError:
                    item_avestar=0.0

            # Store the obtained values; all lists stay index-aligned.
            item_links.append(link)
            item_num_of_reviews.append(item_num_of_review)
            item_prices.append(item_price)
            item_names.append(item_name)
            item_ids.append(item_id)
            item_brands.append(item_brand)
            item_avestars.append(item_avestar)
    return item_brands,item_ids,item_names,item_prices,item_num_of_reviews,item_links,item_avestars

In [659]:
# Run the extractor over all parsed listing pages. The seven lists are index-aligned:
# position i in each list describes the same product.
item_brands,item_ids,item_names,item_prices,item_num_of_reviews,item_links,item_avestars=items_info_extractor(soups)

In [385]:
# Total number of product IDs, then the number of distinct IDs (one per line).
print(len(item_ids),len(set(item_ids)),sep="\n")


387
387

In [386]:
# Total number of product names, then the number of distinct names (one per line).
print(len(item_names),len(set(item_names)),sep="\n")


387
380

In [387]:
# Total number of review links, then the number of distinct links (one per line).
print(len(item_links),len(set(item_links)),sep="\n")


387
387

The above results indicate that there are items that have the same product name but different links.

Cool. Let's find those products.


In [391]:
import collections
# Count every product name once (the counter was previously rebuilt for each key),
# then report and collect the names that occur more than once.
name_counts=collections.Counter(item_names)
item_names_repeated=[]
for key,count in name_counts.items():
    if count>1:
        print(key,count)
        item_names_repeated.append(key)


ILIFE-Robotic-Cleaner-upgraded-Cleaning 2
Decker-Replacement-PHV1800-18-Volt-Pivoting 2
Shark-18V-Hand-Cordless-Vacuum 2
EcoGecko-Portable-Handheld-Mattress-Allergens 2
CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic 2
Wrapables-Animal-Mini-Tabletop-Vacuum 2
Decker-9-6-Volt-Cordless-Dustbuster-BDH9600CHV 2

In [392]:
# Product names that occur more than once in item_names.
print(item_names_repeated)


['ILIFE-Robotic-Cleaner-upgraded-Cleaning', 'Decker-Replacement-PHV1800-18-Volt-Pivoting', 'Shark-18V-Hand-Cordless-Vacuum', 'EcoGecko-Portable-Handheld-Mattress-Allergens', 'CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic', 'Wrapables-Animal-Mini-Tabletop-Vacuum', 'Decker-9-6-Volt-Cordless-Dustbuster-BDH9600CHV']

In [419]:
# Gather (name, review link, price, review count) for every product whose name is duplicated.
items_repeated=[]
for name,link,price,numrev in zip(item_names,item_links,item_prices,item_num_of_reviews):
    if name not in item_names_repeated:
        continue
    items_repeated.append((name,link,price,numrev))

Sort the list with the built-in `sorted`; the `key` function selects the field to sort by (here the product name):


In [420]:
# Order the duplicated products by name (first tuple field) so that equal names sit next to each other.
items_repeated=list(items_repeated)
items_repeated.sort(key=lambda entry: entry[0])

In [424]:
print("item name, item link, item price, total # of reviews of that item","\n")

# Rows are printed in groups of two: every second row is followed by an extra blank line.
for idx,(name,link,price,numrev) in enumerate(items_repeated):
    trailer=("\n",) if idx%2==1 else ()
    print(name,link,price,numrev,*trailer)


item name, item link, item price, total # of reviews of that item 

CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic https://www.amazon.com/CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic/product-reviews/B00KASUEK8/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 32.99 4793
CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic https://www.amazon.com/CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic/product-reviews/B004412GTO/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 53.46 5088 

Decker-9-6-Volt-Cordless-Dustbuster-BDH9600CHV https://www.amazon.com/Decker-9-6-Volt-Cordless-Dustbuster-BDH9600CHV/product-reviews/B01JYUUQD2/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 44.24 4
Decker-9-6-Volt-Cordless-Dustbuster-BDH9600CHV https://www.amazon.com/Decker-9-6-Volt-Cordless-Dustbuster-BDH9600CHV/product-reviews/B016P851MW/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 32.98 5 

Decker-Replacement-PHV1800-18-Volt-Pivoting https://www.amazon.com/Decker-Replacement-PHV1800-18-Volt-Pivoting/product-reviews/B002TAVZIU/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 20.99 17
Decker-Replacement-PHV1800-18-Volt-Pivoting https://www.amazon.com/Decker-Replacement-PHV1800-18-Volt-Pivoting/product-reviews/B002TAXRCC/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 14.99 15 

EcoGecko-Portable-Handheld-Mattress-Allergens https://www.amazon.com/EcoGecko-Portable-Handheld-Mattress-Allergens/product-reviews/B004H6NO8S/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 39.28 1
EcoGecko-Portable-Handheld-Mattress-Allergens https://www.amazon.com/EcoGecko-Portable-Handheld-Mattress-Allergens/product-reviews/B004H6PBUM/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 57.75 1 

ILIFE-Robotic-Cleaner-upgraded-Cleaning https://www.amazon.com/ILIFE-Robotic-Cleaner-upgraded-Cleaning/product-reviews/B01DNMJVEW/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 144.99 13
ILIFE-Robotic-Cleaner-upgraded-Cleaning https://www.amazon.com/ILIFE-Robotic-Cleaner-upgraded-Cleaning/product-reviews/B01DU7MIPE/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 144.95 4 

Shark-18V-Hand-Cordless-Vacuum https://www.amazon.com/Shark-18V-Hand-Cordless-Vacuum/product-reviews/B00K99OZDA/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 65.71 13
Shark-18V-Hand-Cordless-Vacuum https://www.amazon.com/Shark-18V-Hand-Cordless-Vacuum/product-reviews/B00K947ILG/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 139.71 1 

Wrapables-Animal-Mini-Tabletop-Vacuum https://www.amazon.com/Wrapables-Animal-Mini-Tabletop-Vacuum/product-reviews/B000JKERWI/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 13.59 133
Wrapables-Animal-Mini-Tabletop-Vacuum https://www.amazon.com/Wrapables-Animal-Mini-Tabletop-Vacuum/product-reviews/B001O6SHD6/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 13.59 47 

What's found

  • Each of the 7 product names above appears with two different links/IDs (probably due to a different color or seller), and in most cases with different prices.

Now, let's try to merge the obtained data into pandas dataframe

Reference: http://pbpython.com/pandas-list-dict.html


In [661]:
# Sanity check: confirm that the product inspected earlier (B006LXOJC0) was extracted.
# The loop variable is not called `id`, so the builtin id() is not shadowed for the rest of the session.
for pid in item_ids:
    if("B006LXOJC0" in pid):
        print(pid)


B006LXOJC0

In [664]:
from collections import OrderedDict

# Assemble the product table. DataFrame.from_items is deprecated (and removed in later pandas);
# an OrderedDict passed to the constructor keeps the same column order, and the
# scalar "handheld" is broadcast to every row.
df=pd.DataFrame(OrderedDict([("pindex",item_ids),("type","handheld"),("pname",item_names),("brand",item_brands),("price",item_prices),("rurl",item_links),("totalRev",item_num_of_reviews),("avgStars",item_avestars)]))

In [671]:
# Show the review URL, average rating and review count of every product.
df[["rurl","avgStars","totalRev"]]


Out[671]:
rurl avgStars totalRev
0 https://www.amazon.com/BLACK-DECKER-CHV1410L-Cordless-Lithium/product-reviews/B006LXOJC0/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.3 10106
1 https://www.amazon.com/Decker-BDH2000PL-Lithium-Vacuum-20-volt/product-reviews/B00IOEFBKS/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.4 5297
2 https://www.amazon.com/Decker-HHVI320JR02-Dustbuster-Cordless-Lithium/product-reviews/B01DAI5CF6/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.2 848
3 https://www.amazon.com/Black-Decker-HNV215B10-Compact-Lithium/product-reviews/B01BXBX6E6/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4 237
4 https://www.amazon.com/Dirt-Devil-SD20005RED-Scorpion-Handheld/product-reviews/B002D47XOM/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.3 2348
5 https://www.amazon.com/Dyson-V8-Absolute-Cord-Free-Vacuum/product-reviews/B01IENFJ14/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.6 447
6 https://www.amazon.com/Dyson-Motor-Head-Cord-free-Vacuum/product-reviews/B00SMLJQ72/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.4 785
7 https://www.amazon.com/Dyson-V6-Animal-Cord-free-Vacuum/product-reviews/B00SMLJQ7W/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.5 859
8 https://www.amazon.com/Bissell-Eraser-Handheld-Vacuum-33A1/product-reviews/B001EYFQ28/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.4 4236
9 https://www.amazon.com/Eureka-EasyClean-Corded-Hand-Held-71B/product-reviews/B0006HUYGM/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.2 7482
10 https://www.amazon.com/Shark-Cordless-Perfect-Hand-SV780/product-reviews/B0037HHFMO/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3.8 2704
11 https://www.amazon.com/CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic/product-reviews/B00KASUEK8/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 2.7 4793
12 https://www.amazon.com/Hair-Eraser-Cordless-Hand-Vacuum/product-reviews/B01E0472TI/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.2 389
13 https://www.amazon.com/Handheld-Cordless-Cleaner-Lithium-Cyclonic/product-reviews/B01MDT3U9N/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.7 27
14 https://www.amazon.com/Decker-HNV220BCZ01FF-Compact-Lithium-Vacuum/product-reviews/B01BXBX6CI/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.3 290
15 https://www.amazon.com/Shark-Rocket-Corded-Hand-HV292/product-reviews/B00P9Z36T8/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.4 233
16 https://www.amazon.com/BDH2000L-20-Volt-Lithium-Battery-Cordless/product-reviews/B008R3F0J8/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.2 886
17 https://www.amazon.com/Dyson-V6-Cord-Free-Vacuum/product-reviews/B00SMLJPIC/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.3 917
18 https://www.amazon.com/Dyson-Animal-Cordless-Certified-Refurbished/product-reviews/B01AVXFD6Q/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.2 146
19 https://www.amazon.com/BLACK-DECKER-BDH2020FLFH-Lithium-20-volt/product-reviews/B00JILGZOC/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.2 638
20 https://www.amazon.com/BISSELL-SPOTLIFTER-ESSENTIAL-RED-1719T/product-reviews/B00IRJ1BYU/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3.6 1311
21 https://www.amazon.com/Dirt-Devil-Cleaner-Handheld-M08230RED/product-reviews/B000050HCV/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4 708
22 https://www.amazon.com/Black-Decker-BDH9600CHV-Dustbuster-Cordless/product-reviews/B00ECM5RRI/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3 2850
23 https://www.amazon.com/PAV1200W-Cyclonic-Action-Automotive-Pivoting-Nose-Handheld/product-reviews/B001AQEQNA/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3.9 1945
24 https://www.amazon.com/Bissell-Aeroswift-Compact-Bagless-1009/product-reviews/B0007Z69G2/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.2 377
25 https://www.amazon.com/Stanley-Decker-FHV1200W-Cordless-Canister/product-reviews/B002FQJW4W/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3.8 1601
26 https://www.amazon.com/Decker-HHVI315JO42-Dustbuster-Cordless-Lithium/product-reviews/B01DAI5BZ2/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.3 163
27 https://www.amazon.com/Hoover-Cleaner-Cordless-Handheld-BH52120PC/product-reviews/B00SWCU1LG/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3.8 652
28 https://www.amazon.com/BLACK-DECKER-BDH2020FL-Lithium-Brush/product-reviews/B00CCYLBZ0/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4.4 131
29 https://www.amazon.com/Dirt-Devil-BD10025WX-Bagless-Cordless/product-reviews/B00489LHEM/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3.5 58
... ... ... ...
357 https://www.amazon.com/CleanWave-UV-C-Vacuum-Cleaner-Prevention/product-reviews/B008JGXHZW/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
358 https://www.amazon.com/Bionaire-Hand-Held-Turbo-Vac/product-reviews/B00440E50I/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3 2
359 https://www.amazon.com/HAMMER-Dirt-Devil-Scorpion-Filter/product-reviews/B0019L6YU4/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 2.7 4
360 https://www.amazon.com/Creative-Fashion-Keyboard-Desktop-Handheld/product-reviews/B00JWM9CHO/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 0 1
361 https://www.amazon.com/Vacuum-Company-Car-Cleaner-CV-LDA105/product-reviews/B01LVXEM9Y/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
362 https://www.amazon.com/Kole-Imports-CA026-Vacuum-Cleaner/product-reviews/B00DOTD7KY/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
363 https://www.amazon.com/Honeywell-H19003-Replacement-DustBuster-Cyclonic/product-reviews/B000UKQMLG/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
364 https://www.amazon.com/WackyVac-Wacky-Vac-Attachment/product-reviews/B000FMIRLM/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
365 https://www.amazon.com/Metrovac-Electra-Sweep-Power-Broom/product-reviews/B00JEMTJ66/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3 2
366 https://www.amazon.com/Iautomatic-Cleaner-Accessory-suction-1-25-Inch/product-reviews/B00GQ4LH06/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
367 https://www.amazon.com/Koblenz-HV-120-Corded-120-Volt-Hand/product-reviews/B00CDTPGVO/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4 1
368 https://www.amazon.com/Unilution-75210-White-Portable-Handheld-Mattress/product-reviews/B008CCQJUI/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 2 2
369 https://www.amazon.com/DECKER-CHV4800-BUSTER-CORDLESS-KITCHENWARE/product-reviews/B007DIT1KW/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
370 https://www.amazon.com/AUVWD2-Dirt-Magic-Vacuum-Cleaner/product-reviews/B005HP495A/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
371 https://www.amazon.com/EcoGecko-Portable-Handheld-Mattress-Allergens/product-reviews/B004H6PBUM/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
372 https://www.amazon.com/Panasonic-handheld-vacuum-cleaner-MC-D25CP-WA/product-reviews/B001SN8FAA/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
373 https://www.amazon.com/Royal-Appliance-3-089570-001-Replacement-Filter/product-reviews/B000SMOFHE/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
374 https://www.amazon.com/Atrix-VACAMEXP-Express-Handi-Vacuum/product-reviews/B01MQE6O7Y/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4 1
375 https://www.amazon.com/Pullman-Holt-86asb5d4c-Dry-Hepa-Gallon/product-reviews/B01FHCCPHQ/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
376 https://www.amazon.com/Black-Decker-VBBM10-Multi-Surface-Brush/product-reviews/B01DP73BR8/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
377 https://www.amazon.com/Kalorik-Artisan-Hand-Vacuum-Red/product-reviews/B01APER954/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
378 https://www.amazon.com/Black-Decker-cordless-cleaner-PV1220/product-reviews/B00HVDAUJA/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
379 https://www.amazon.com/Antique-Brass-Hod-Shovel-Set/product-reviews/B008749ATI/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
380 https://www.amazon.com/SANDHILL-Hearth-Country-Ash-Vacuum/product-reviews/B008747XSI/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
381 https://www.amazon.com/Metro-Vac-12V-Portable-Car/product-reviews/B0057ERKAW/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 4 1
382 https://www.amazon.com/Royal-Appliance-3-088570-001-Cordless-Filter/product-reviews/B0044UWGI6/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3.7 37
383 https://www.amazon.com/Metropolitian-Cleaner-Extension-Blaster-HNBRK-2/product-reviews/B003CJD96I/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 5 1
384 https://www.amazon.com/ReadiVac-39006-CompuVac-Vacuum-Cleaner/product-reviews/B000EOWA6Y/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 2.5 4
385 https://www.amazon.com/BLACK-DECKER-cyclone-buster-Z-ACV1205/product-reviews/B000BWUN8G/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 1 1
386 https://www.amazon.com/COLEMAN-POWERMATE-PMV6990-Cordless-Vacuum-Cleaner/product-reviews/B00007E7M0/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000 3 3

387 rows × 3 columns

Let's upload the obtained dataframe to MariaDB


In [466]:
from sqlalchemy import create_engine,Table,Column,Integer,String,MetaData,ForeignKey,Date
import os
import pymysql

# SECURITY: the connection URL (user, password, host) used to be hard-coded in this
# cell. It is now read from the environment, e.g.
#   export AMZ_DB_URL="mysql+pymysql://<user>:<password>@<host>:<port>/Tests?charset=utf8"
# A missing variable raises KeyError instead of silently connecting somewhere else.
# The password that was previously written here must be treated as leaked and rotated.
#
# The former `encoding='utf-8'` keyword is dropped: utf-8 was already SQLAlchemy's
# default, and SQLAlchemy 2.x rejects the keyword. The charset is set in the URL.
engine=create_engine(os.environ["AMZ_DB_URL"],echo=False)

# Append the rows of `df` to the existing `amzProd` table. The `with` block closes the
# connection even when to_sql raises, so a failed upload no longer leaks it.
with engine.connect() as conn:
    df.to_sql(name='amzProd', con=conn, if_exists = 'append', index=False)


/usr/local/lib/python3.5/site-packages/pymysql/cursors.py:166: Warning: (1265, "Data truncated for column 'pname' at row 80")
  result = self._query(query)
/usr/local/lib/python3.5/site-packages/pymysql/cursors.py:166: Warning: (1265, "Data truncated for column 'brand' at row 313")
  result = self._query(query)

Alternatively, we can store the obtained dataframe into a csv file


In [469]:
# Persist the product table as a UTF-8 CSV file. `index` is left at its default, so the
# row index is written as the first (unnamed) column of the file.
df.to_csv("ProdInfo_handheld_26012017.csv", encoding="utf-8")

And load it:


In [474]:
# Load the CSV back into a dataframe. DataFrame.from_csv is deprecated and was removed
# in pandas 1.0, so read_csv is used instead. index_col=0 restores the row index that
# to_csv wrote as the first column. from_csv also tried to parse that index as dates;
# the index here is plain integer row numbers, so no date parsing is requested.
pd.read_csv("ProdInfo_handheld_26012017.csv", index_col=0, encoding="utf-8")


Out[474]:
pindex type pname brand price rurl totalRev
0 B006LXOJC0 handheld BLACK-DECKER-CHV1410L-Cordless-Lithium BLACK+DECKER 59.99 https://www.amazon.com/BLACK-DECKER-CHV1410L-C... 10106
1 B00IOEFBKS handheld Decker-BDH2000PL-Lithium-Vacuum-20-volt BLACK+DECKER 68.99 https://www.amazon.com/Decker-BDH2000PL-Lithiu... 5297
2 B01DAI5CF6 handheld Decker-HHVI320JR02-Dustbuster-Cordless-Lithium BLACK+DECKER 35.09 https://www.amazon.com/Decker-HHVI320JR02-Dust... 848
3 B01BXBX6E6 handheld Black-Decker-HNV215B10-Compact-Lithium BLACK+DECKER 24.99 https://www.amazon.com/Black-Decker-HNV215B10-... 237
4 B002D47XOM handheld Dirt-Devil-SD20005RED-Scorpion-Handheld Dirt Devil 28.99 https://www.amazon.com/Dirt-Devil-SD20005RED-S... 2348
5 B01IENFJ14 handheld Dyson-V8-Absolute-Cord-Free-Vacuum Dyson 539.00 https://www.amazon.com/Dyson-V8-Absolute-Cord-... 447
6 B00SMLJQ72 handheld Dyson-Motor-Head-Cord-free-Vacuum Dyson 289.99 https://www.amazon.com/Dyson-Motor-Head-Cord-f... 785
7 B00SMLJQ7W handheld Dyson-V6-Animal-Cord-free-Vacuum Dyson 349.99 https://www.amazon.com/Dyson-V6-Animal-Cord-fr... 859
8 B001EYFQ28 handheld Bissell-Eraser-Handheld-Vacuum-33A1 Bissell 29.99 https://www.amazon.com/Bissell-Eraser-Handheld... 4236
9 B0006HUYGM handheld Eureka-EasyClean-Corded-Hand-Held-71B Eureka 37.99 https://www.amazon.com/Eureka-EasyClean-Corded... 7482
10 B0037HHFMO handheld Shark-Cordless-Perfect-Hand-SV780 SharkNinja 48.47 https://www.amazon.com/Shark-Cordless-Perfect-... 2704
11 B00KASUEK8 handheld CHV1510-Dustbuster-15-6-Volt-Cordless-Cyclonic BLACK+DECKER 32.99 https://www.amazon.com/CHV1510-Dustbuster-15-6... 4793
12 B01E0472TI handheld Hair-Eraser-Cordless-Hand-Vacuum Bissell 44.99 https://www.amazon.com/Hair-Eraser-Cordless-Ha... 389
13 B01MDT3U9N handheld Handheld-Cordless-Cleaner-Lithium-Cyclonic HoLife 59.99 https://www.amazon.com/Handheld-Cordless-Clean... 27
14 B01BXBX6CI handheld Decker-HNV220BCZ01FF-Compact-Lithium-Vacuum BLACK+DECKER 29.99 https://www.amazon.com/Decker-HNV220BCZ01FF-Co... 290
15 B00P9Z36T8 handheld Shark-Rocket-Corded-Hand-HV292 SharkNinja 81.34 https://www.amazon.com/Shark-Rocket-Corded-Han... 233
16 B008R3F0J8 handheld BDH2000L-20-Volt-Lithium-Battery-Cordless BLACK+DECKER 76.50 https://www.amazon.com/BDH2000L-20-Volt-Lithiu... 886
17 B00SMLJPIC handheld Dyson-V6-Cord-Free-Vacuum Dyson 299.00 https://www.amazon.com/Dyson-V6-Cord-Free-Vacu... 917
18 B01AVXFD6Q handheld Dyson-Animal-Cordless-Certified-Refurbished Dyson 319.00 https://www.amazon.com/Dyson-Animal-Cordless-C... 146
19 B00JILGZOC handheld BLACK-DECKER-BDH2020FLFH-Lithium-20-volt BLACK+DECKER 95.00 https://www.amazon.com/BLACK-DECKER-BDH2020FLF... 638
20 B00IRJ1BYU handheld BISSELL-SPOTLIFTER-ESSENTIAL-RED-1719T Bissell 49.00 https://www.amazon.com/BISSELL-SPOTLIFTER-ESSE... 1311
21 B000050HCV handheld Dirt-Devil-Cleaner-Handheld-M08230RED Dirt Devil 35.60 https://www.amazon.com/Dirt-Devil-Cleaner-Hand... 708
22 B00ECM5RRI handheld Black-Decker-BDH9600CHV-Dustbuster-Cordless BLACK+DECKER 29.69 https://www.amazon.com/Black-Decker-BDH9600CHV... 2850
23 B001AQEQNA handheld PAV1200W-Cyclonic-Action-Automotive-Pivoting-N... BLACK+DECKER 53.77 https://www.amazon.com/PAV1200W-Cyclonic-Actio... 1945
24 B0007Z69G2 handheld Bissell-Aeroswift-Compact-Bagless-1009 Bissell 69.99 https://www.amazon.com/Bissell-Aeroswift-Compa... 377
25 B002FQJW4W handheld Stanley-Decker-FHV1200W-Cordless-Canister BLACK+DECKER 54.27 https://www.amazon.com/Stanley-Decker-FHV1200W... 1601
26 B01DAI5BZ2 handheld Decker-HHVI315JO42-Dustbuster-Cordless-Lithium BLACK+DECKER 50.63 https://www.amazon.com/Decker-HHVI315JO42-Dust... 163
27 B00SWCU1LG handheld Hoover-Cleaner-Cordless-Handheld-BH52120PC Hoover 112.95 https://www.amazon.com/Hoover-Cleaner-Cordless... 652
28 B00CCYLBZ0 handheld BLACK-DECKER-BDH2020FL-Lithium-Brush BLACK+DECKER 133.56 https://www.amazon.com/BLACK-DECKER-BDH2020FL-... 131
29 B00489LHEM handheld Dirt-Devil-BD10025WX-Bagless-Cordless Dirt Devil 18.62 https://www.amazon.com/Dirt-Devil-BD10025WX-Ba... 58
... ... ... ... ... ... ... ...
357 B008JGXHZW handheld CleanWave-UV-C-Vacuum-Cleaner-Prevention Allergy Asthma Technology 106.10 https://www.amazon.com/CleanWave-UV-C-Vacuum-C... 1
358 B00440E50I handheld Bionaire-Hand-Held-Turbo-Vac Sensio Bionaire 37.07 https://www.amazon.com/Bionaire-Hand-Held-Turb... 2
359 B0019L6YU4 handheld HAMMER-Dirt-Devil-Scorpion-Filter Arm & Hammer 6.99 https://www.amazon.com/HAMMER-Dirt-Devil-Scorp... 4
360 B00JWM9CHO handheld Creative-Fashion-Keyboard-Desktop-Handheld Panda Superstore 21.32 https://www.amazon.com/Creative-Fashion-Keyboa... 1
361 B01LVXEM9Y handheld Vacuum-Company-Car-Cleaner-CV-LDA105 Vacuum Company 34.55 https://www.amazon.com/Vacuum-Company-Car-Clea... 1
362 B00DOTD7KY handheld Kole-Imports-CA026-Vacuum-Cleaner Kole Imports 16.00 https://www.amazon.com/Kole-Imports-CA026-Vacu... 1
363 B000UKQMLG handheld Honeywell-H19003-Replacement-DustBuster-Cyclonic Honeywell 12.79 https://www.amazon.com/Honeywell-H19003-Replac... 1
364 B000FMIRLM handheld WackyVac-Wacky-Vac-Attachment WackyVac 12.99 https://www.amazon.com/WackyVac-Wacky-Vac-Atta... 1
365 B00JEMTJ66 handheld Metrovac-Electra-Sweep-Power-Broom MetroVac 145.95 https://www.amazon.com/Metrovac-Electra-Sweep-... 2
366 B00GQ4LH06 handheld Iautomatic-Cleaner-Accessory-suction-1-25-Inch Iautomatic 3.99 https://www.amazon.com/Iautomatic-Cleaner-Acce... 1
367 B00CDTPGVO handheld Koblenz-HV-120-Corded-120-Volt-Hand Koblenz 50.98 https://www.amazon.com/Koblenz-HV-120-Corded-1... 1
368 B008CCQJUI handheld Unilution-75210-White-Portable-Handheld-Mattress Uniution Inc. 84.03 https://www.amazon.com/Unilution-75210-White-P... 2
369 B007DIT1KW handheld DECKER-CHV4800-BUSTER-CORDLESS-KITCHENWARE Black and Decker Office Products 61.00 https://www.amazon.com/DECKER-CHV4800-BUSTER-C... 1
370 B005HP495A handheld AUVWD2-Dirt-Magic-Vacuum-Cleaner B&F 17.13 https://www.amazon.com/AUVWD2-Dirt-Magic-Vacuu... 1
371 B004H6PBUM handheld EcoGecko-Portable-Handheld-Mattress-Allergens EcoGecko 57.75 https://www.amazon.com/EcoGecko-Portable-Handh... 1
372 B001SN8FAA handheld Panasonic-handheld-vacuum-cleaner-MC-D25CP-WA Panasonic 123.61 https://www.amazon.com/Panasonic-handheld-vacu... 1
373 B000SMOFHE handheld Royal-Appliance-3-089570-001-Replacement-Filter Royal Appliance 7.55 https://www.amazon.com/Royal-Appliance-3-08957... 1
374 B01MQE6O7Y handheld Atrix-VACAMEXP-Express-Handi-Vacuum Atrix 84.95 https://www.amazon.com/Atrix-VACAMEXP-Express-... 1
375 B01FHCCPHQ handheld Pullman-Holt-86asb5d4c-Dry-Hepa-Gallon Pullman-Holt 864.00 https://www.amazon.com/Pullman-Holt-86asb5d4c-... 1
376 B01DP73BR8 handheld Black-Decker-VBBM10-Multi-Surface-Brush BLACK+DECKER 29.99 https://www.amazon.com/Black-Decker-VBBM10-Mul... 1
377 B01APER954 handheld Kalorik-Artisan-Hand-Vacuum-Red Kalorik 86.94 https://www.amazon.com/Kalorik-Artisan-Hand-Va... 1
378 B00HVDAUJA handheld Black-Decker-cordless-cleaner-PV1220 BLACK+DECKER 146.43 https://www.amazon.com/Black-Decker-cordless-c... 1
379 B008749ATI handheld Antique-Brass-Hod-Shovel-Set SANDHILL 105.79 https://www.amazon.com/Antique-Brass-Hod-Shove... 1
380 B008747XSI handheld SANDHILL-Hearth-Country-Ash-Vacuum SANDHILL 182.99 https://www.amazon.com/SANDHILL-Hearth-Country... 1
381 B0057ERKAW handheld Metro-Vac-12V-Portable-Car MetroVac 72.46 https://www.amazon.com/Metro-Vac-12V-Portable-... 1
382 B0044UWGI6 handheld Royal-Appliance-3-088570-001-Cordless-Filter Royal Appliance/Tti 33.01 https://www.amazon.com/Royal-Appliance-3-08857... 37
383 B003CJD96I handheld Metropolitian-Cleaner-Extension-Blaster-HNBRK-2 Metropolitian 55.63 https://www.amazon.com/Metropolitian-Cleaner-E... 1
384 B000EOWA6Y handheld ReadiVac-39006-CompuVac-Vacuum-Cleaner ReadiVac 21.99 https://www.amazon.com/ReadiVac-39006-CompuVac... 4
385 B000BWUN8G handheld BLACK-DECKER-cyclone-buster-Z-ACV1205 BLACK+DECKER 53.32 https://www.amazon.com/BLACK-DECKER-cyclone-bu... 1
386 B00007E7M0 handheld COLEMAN-POWERMATE-PMV6990-Cordless-Vacuum-Cleaner Team Products 78.00 https://www.amazon.com/COLEMAN-POWERMATE-PMV69... 3

387 rows × 7 columns

Upload the obtained CSV files to the remote MariaDB


In [496]:
from sqlalchemy import create_engine,Table,Column,Integer,String,MetaData,ForeignKey,Date
import pymysql
import datetime

I found out that the same pindex can appear in more than one dataframe (for example, a product listed under both "canister" and "handheld", as shown below). This leads to an error when we upload the data to MariaDB, because the primary key must be unique.


In [531]:
# Widen the column display so the long review URLs are printed in full.
pd.set_option('display.max_colwidth', 800)

# Product id that is looked up in every per-type dataframe.
dupPindex='B00SWGVICS'

# Print the matching rows (possibly none) of each dataframe in `dfs`.
# The loop variable is deliberately not called `df`: reusing that name would silently
# rebind the notebook-wide `df` to the last per-type frame once the loop finishes.
for idx,typeDf in enumerate(dfs):
    print(idx,typeDf.loc[typeDf['pindex'] == dupPindex])


0 Empty DataFrame
Columns: [pindex, type, pname, brand, price, rurl, totalRev]
Index: []
1         pindex      type                                      pname  \
34  B00SWGVICS  canister  Hoover-Commercial-CH32008-Canister-Vacuum   

                brand  price  \
34  Hoover Commercial  141.0   

                                                                                                                                                                                     rurl  \
34  https://www.amazon.com/Hoover-Commercial-CH32008-Canister-Vacuum/product-reviews/B00SWGVICS/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000   

    totalRev  
34        54  
2         pindex      type                                      pname  \
68  B00SWGVICS  handheld  Hoover-Commercial-CH32008-Canister-Vacuum   

                brand  price  \
68  Hoover Commercial  141.0   

                                                                                                                                                                                     rurl  \
68  https://www.amazon.com/Hoover-Commercial-CH32008-Canister-Vacuum/product-reviews/B00SWGVICS/ref=cm_cr_getr_d_paging_btm_1?ie=UTF8&pageNumber=1&reviewerType=all_reviews&pageSize=1000   

    totalRev  
68        54  
3 Empty DataFrame
Columns: [pindex, type, pname, brand, price, rurl, totalRev]
Index: []
4 Empty DataFrame
Columns: [pindex, type, pname, brand, price, rurl, totalRev]
Index: []
5 Empty DataFrame
Columns: [pindex, type, pname, brand, price, rurl, totalRev]
Index: []
6 Empty DataFrame
Columns: [pindex, type, pname, brand, price, rurl, totalRev]
Index: []

Strategy: load all CSV files into a single dataframe, then remove all duplicates before uploading to the database.


In [42]:
import os
from IPython.display import display

In [32]:
# Remember the directory the notebook kernel is currently running in.
cwd=os.getcwd()

In [33]:
# Show the working directory. NOTE(review): the output is an absolute, machine-specific
# path; consider clearing it before sharing the notebook.
print(cwd)


/Users/chweng/Google Drive/SemanticProj/webCrawler

Now, it's time to get to know the Pandas Dataframe better. I'd like to figure out how two dataframes can be merged horizontally.

A one-column example: build two single-column DataFrames (e.g. with pd.DataFrame.from_items()).


In [58]:
# Two toy single-column frames sharing the default 0..9 row index.
# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0; the plain
# dict constructor builds the same frames on old and new pandas alike.
test_col = pd.DataFrame({"test_column1": np.arange(10)})
test_col2 = pd.DataFrame({"test_column2": 5 + np.arange(10)})
# Render both toy frames, one after the other, using the notebook's rich display.
display(test_col,test_col2)


test_column1
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
test_column2
0 5
1 6
2 7
3 8
4 9
5 10
6 11
7 12
8 13
9 14

In [59]:
# axis=1 places the frames side by side (column-wise); rows are matched on their index labels.
result = pd.concat([test_col, test_col2], axis=1)

In [60]:
# Show the horizontally merged frame.
display(result)


test_column1 test_column2
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
5 5 10
6 6 11
7 7 12
8 8 13
9 9 14


In [7]:
date="2017-02-01"
prodTypes=["central","canister","handheld","robotic","stick","upright","wetdry"]

# Load one product-info CSV per vacuum type into a list of dataframes.
# read_csv(index_col=0) replaces DataFrame.from_csv, which was removed in pandas 1.0;
# the first CSV column is the row index written by to_csv.
dfs=[pd.read_csv("data/ProdInfo_%s_%s.csv"%(prodType,date), index_col=0, encoding="utf-8") for prodType in prodTypes]

# Add a `cID` column that cycles 0,1,...,6 over the rows of each frame.
# NOTE(review): presumably a crawler/worker id used to split the work into 7 shares - confirm.
# The value is assigned by row POSITION. The previous concat(axis=1) with a helper frame
# matched rows by index LABEL and therefore produced NaN/extra rows whenever a CSV's
# index was not exactly 0..n-1. It also relied on the removed DataFrame.from_items.
numCID=7
dfs=[typeDf.assign(cID=np.arange(len(typeDf))%numCID) for typeDf in dfs]

# Stack all types vertically and keep only the first row for every review URL, so a
# product listed under several vacuum types appears once.
df=pd.concat(dfs).drop_duplicates("rurl")

# The per-type row labels are kept, so the index column of the combined CSV may repeat.
df.to_csv("ProdInfo_all_%s.csv"%(date), encoding="utf-8")

In [5]:
import os

# Crawl date whose CSV files are uploaded. (A dead `date="2017-02-01"` assignment that
# was immediately overwritten has been removed.)
date="2017-02-06"
prodTypes=["central","canister","handheld","robotic","stick","upright","wetdry"]

# Load one product-info CSV per vacuum type into a list of dataframes.
# read_csv(index_col=0) replaces DataFrame.from_csv, which was removed in pandas 1.0;
# the first CSV column is the row index written by to_csv.
dfs=[pd.read_csv("data/ProdInfo_%s_%s.csv"%(prodType,date), index_col=0, encoding="utf-8") for prodType in prodTypes]

# Add a `cID` column that cycles 0,1,...,6 over the rows of each frame.
# NOTE(review): presumably a crawler/worker id used to split the work into 7 shares - confirm.
# The value is assigned by row POSITION. The previous concat(axis=1) with a helper frame
# matched rows by index LABEL and therefore produced NaN/extra rows whenever a CSV's
# index was not exactly 0..n-1. It also relied on the removed DataFrame.from_items.
numCID=7
dfs=[typeDf.assign(cID=np.arange(len(typeDf))%numCID) for typeDf in dfs]

# Stack all types vertically and drop duplicates here, BEFORE the upload: a product
# listed under several vacuum types must reach the database only once.
df=pd.concat(dfs).drop_duplicates("rurl")

# SECURITY: the connection URL (user, password, host) used to be hard-coded in this
# cell. It is now read from the environment, e.g.
#   export AMZ_DB_URL="mysql+pymysql://<user>:<password>@<host>:<port>/Tests?charset=utf8"
# A missing variable raises KeyError instead of silently connecting somewhere else.
# The password that was previously written here must be treated as leaked and rotated.
# `encoding='utf-8'` is dropped: it was SQLAlchemy's default and 2.x rejects the keyword.
engine=create_engine(os.environ["AMZ_DB_URL"],echo=False)

# Append the de-duplicated rows to the `amzProd` table. The `with` block closes the
# connection even when to_sql raises.
with engine.connect() as conn:
    df.to_sql(name='amzProd', con=conn, if_exists = 'append', index=False)

In [111]:
# Number of characters in the `brand` value of the row at position 974.
# NOTE(review): presumably a row flagged by a "Data truncated" warning during upload - confirm.
len(df.iloc[974]["brand"])


Out[111]:
60

In [540]:
# Show the full `pname` value of the row at position 463 (an unusually long product name).
df.iloc[463]["pname"]


Out[540]:
'Handheld-Vacuum-Cleaner-Abask-Vacuum-Cleaner-7-2V-60W-Ni-CD2200MA-3-5KPA-Suction-Portable-1-Accessories-Rechargeable-Cordless-Cleaner'

In [543]:
# Count the characters of the product name with the shell. `echo` appends a newline and
# `wc` counts it, so the reported 134 corresponds to a 133-character string.
!echo "Handheld-Vacuum-Cleaner-Abask-Vacuum-Cleaner-7-2V-60W-Ni-CD2200MA-3-5KPA-Suction-Portable-1-Accessories-Rechargeable-Cordless-Cleaner"| wc


       1       1     134

The length of this string (133 characters; the 134 reported by wc includes the trailing newline added by echo) is larger than 100. Therefore, I have to alter our schema, since the product name column was set to have length 100 by default.