In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
In [2]:
class ProductListing:
    """Thin wrapper around the BeautifulSoup object of a single product listing."""
    def __init__(self, soup):
        self.soup = soup
    def name(self):
        return self.soup.a.string
    def link(self):
        return self.soup.a.get("href")
    def itemId(self):
        # the item ID is the last path component of the product link
        return self.soup.a.get("href").split('/')[-1]
    def rating(self):
        return self.soup.find('span', class_='bv-text-link').string
    def price(self):
        return self.soup.find("span", class_="actual-price").meta.string.strip()
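As a quick sanity check, the class can be exercised on a small snippet before pointing it at the live site. The markup and product name below are made up to mimic the shape of a listing's innerHTML (an anchor plus a rating span); price() is skipped since I'm not reproducing that part of the markup here.

sample_html = '''
<a href="/noodlers-apache-sunset/p/ITEM123">Apache Sunset</a>
<span class="bv-text-link">4.8</span>
'''
sample = ProductListing(BeautifulSoup(sample_html, 'html.parser'))
sample.name()    # 'Apache Sunset'
sample.itemId()  # 'ITEM123'
sample.rating()  # '4.8'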
In [3]:
# set up the Chrome webdriver
path_to_chromedriver = '/Users/csiu/lib/chromedriver'
browser = webdriver.Chrome(executable_path=path_to_chromedriver)
In [4]:
# load the product listing page (500 items per page)
url = 'http://www.gouletpens.com/noodlers-bottled-ink/c/128/?pageSize=500'
browser.get(url)
In [5]:
time.sleep(10)  # give the browser time for the JS-generated content to render before we collect it
element = browser.find_elements_by_xpath('//div[@class="mz-productlisting-info"]')
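A hard-coded sleep works but is fragile; a sketch of an explicit wait using Selenium's WebDriverWait (same XPath, assuming the listings really are JS-rendered) blocks only until the elements actually appear:

# alternative: wait up to 10 s for the JS-rendered listings to show up
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

element = WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//div[@class="mz-productlisting-info"]')))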
In [6]:
data = {
    "name": [],
    "itemId": [],
    "rating": [],
    "price": []
}
for e in element:
    htmlData = e.get_attribute('innerHTML')
    soup = BeautifulSoup(htmlData, 'html.parser')
    pl = ProductListing(soup)
    data["name"].append(pl.name())
    data["itemId"].append(pl.itemId())
    data["rating"].append(pl.rating())
    data["price"].append(pl.price())
Top 10: Highest-rated Noodler's ink products from Goulet Pens
In [7]:
df = pd.DataFrame(data)
df.sort_values(by="rating", ascending=False, inplace=True)
df.head(10)
Out[7]:
In [8]:
df.to_csv("goulet-ranks.csv", index=False)