In [ ]:
# https://datasciencedojo.com/web-scraping-30-minutes/
# https://vimeo.com/209499033

In [1]:
from IPython.display import HTML

# Vimeo: embed the tutorial video in the cell output.
# <iframe> is not a void element, so it needs an explicit closing tag; without
# </iframe> the browser treats whatever markup follows as iframe fallback content.
HTML('<iframe src="https://player.vimeo.com/video/209499033?title=0&byline=0&portrait=0" width="700" height="394" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>')

# Youtube (alternative embed of the same tutorial, kept for reference)
#HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/S_f2qV2_U00?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')


Out[1]:

In [47]:
# following along with tutorial in the video above
import csv
from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'

# Open the connection and grab the raw page bytes.
# The `with` block closes the connection even when read() raises, which the
# previous explicit close() call did not guarantee.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# html parsing
page_soup = soup(page_html, "html.parser")

# Sanity checks that the document was parsed: first <h1>, first <p>, and the
# first <span> inside <body>. Only the last expression of a cell is displayed,
# so run these one at a time in their own cell to actually see the values.
page_soup.h1
page_soup.p
page_soup.body.span

# Use "inspect" in the browser to find the tag that holds the graphics card info.
# First work out how to get the info for one card (the whole container),
# then loop through all the others.
# find_all is the current Beautiful Soup 4 name; findAll is only a legacy alias.
containers = page_soup.find_all('div', class_='item-container')

# Exploration steps from the tutorial. They were previously written as
# triple-quoted strings, which are evaluated (and discarded) as expressions;
# real comments make the intent explicit. Run them one at a time in a scratch cell.
#
#   len(containers)                      # how many containers did we find?
#   containers[0]                        # look at the first one
#
# To understand the structure better, paste the output for the first container
# into jsbeautifier.org and follow the prettified HTML to the data to extract.
#
#   container = containers[0]
#   container.div.div.a.img['title']     # the brand is in the image title within
#                                        # the <a> within a <div> within the first <div>

# Last step: write every product to a CSV file.
filename = 'products.csv'

# Column names are written without the blank after each comma that the old
# hand-built header had; those blanks ended up inside the column names
# (' product_name', ' shipping') when the file was read back.
headers = ['brand', 'product_name', 'shipping']

# - `with` closes the file even if a container is missing an expected tag and
#   the loop raises part-way through.
# - newline='' lets the csv module control line endings itself; lineterminator='\n'
#   keeps the same '\n' row endings the file had before.
# - csv.writer quotes any field that contains a comma, so product names keep
#   their commas (no more replacing them with pipes) and a comma in the brand
#   or shipping text can no longer shift the columns.
with open(filename, 'w', newline='', encoding='utf-8') as f:  # w for write
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(headers)

    # Apply the single-container extraction worked out above to every container.
    for container in containers:
        # brand is the title attribute of the brand logo image
        brand = container.div.div.a.img['title']

        # product name is the text of the first <a class="item-title">
        title_container = container.find_all('a', {'class': 'item-title'})
        product_name = title_container[0].text

        # shipping is the text of the first <li class="price-ship">, without surrounding whitespace
        shipping_container = container.find_all('li', {'class': 'price-ship'})
        shipping = shipping_container[0].text.strip()

        print('brand: ' + brand)
        print('product_name: ' + product_name)
        print('shipping: ' + shipping)

        writer.writerow([brand, product_name, shipping])


brand: GIGABYTE
product_name: GIGABYTE GeForce GTX 1050 Ti DirectX 12 GV-N105TWF2OC-4GD Video Card
shipping: $4.99 Shipping
brand: EVGA
product_name: EVGA GeForce GTX 1060 GAMING DirectX 12 06G-P4-6262-KR Video Card
shipping: $4.99 Shipping
brand: ASUS
product_name: ASUS ROG GeForce GTX 1070 STRIX-GTX1070-O8G-GAMING Video Card with RGB Lighting
shipping: $3.99 Shipping
brand: MSI
product_name: MSI GeForce GTX 1070 DirectX 12 GeForce GTX 1070 Quick Silver 8G OC Video Card
shipping: $3.99 Shipping
brand: XFX
product_name: XFX Radeon RX 550 DirectX 12 RX-550P4TFG5 Video Card
shipping: $3.99 Shipping
brand: ZOTAC
product_name: ZOTAC GeForce GTX 1080 Ti AMP Edition 11GB GDDR5X 352-bit Gaming Graphics Card VR Ready 16+2 Power Phase Freeze Fan Stop IceStorm Cooling Spectra Lighting ZT-P10810D-10P
shipping: Free Shipping
brand: EVGA
product_name: EVGA GeForce GTX 1070 SC GAMING ACX 3.0 Black Edition, 08G-P4-5173-KR, 8GB GDDR5, LED, DX12 OSD Support (PXOC)
shipping: $4.99 Shipping
brand: GIGABYTE
product_name: GIGABYTE Radeon RX 460 DirectX 12 GV-RX460WF2OC-4GD Video Card
shipping: Free Shipping
brand: XFX
product_name: XFX Radeon RX 460 DirectX 12 RX-460P4DFG5 Video Card
shipping: $3.99 Shipping
brand: MSI
product_name: MSI Radeon RX 550 DirectX 12 RX 550 AERO ITX 2G O 2GB 128-Bit GDDR5 PCI Express x16 (uses x8) HDCP Ready Video Card
shipping: $3.99 Shipping
brand: ZOTAC
product_name: ZOTAC GeForce GTX 1080 Ti AMP Extreme 11GB GDDR5X 352-bit Gaming Graphics Card VR Ready 16+2 Power Phase Freeze Fan Stop IceStorm Cooling Spectra Lighting ZT-P10810C-10P
shipping: $4.99 Shipping
brand: ASUS
product_name: ASUS GeForce GTX 1060 DUAL-GTX1060-O3G Video Card
shipping: $3.99 Shipping

In [48]:
import pandas as pd
# Load the scraped products back in to check the file.
# skipinitialspace=True drops the blank that follows each delimiter, so a header
# written as 'brand, product_name, shipping' gives the columns 'brand',
# 'product_name' and 'shipping' instead of ' product_name' and ' shipping'
# (which would make df_products['product_name'] raise KeyError).
df_products = pd.read_csv('products.csv', skipinitialspace=True)
df_products.head()


Out[48]:
brand product_name shipping
0 GIGABYTE GIGABYTE GeForce GTX 1050 Ti DirectX 12 GV-N10... $4.99 Shipping
1 EVGA EVGA GeForce GTX 1060 GAMING DirectX 12 06G-P4... $4.99 Shipping
2 ASUS ASUS ROG GeForce GTX 1070 STRIX-GTX1070-O8G-GA... $3.99 Shipping
3 MSI MSI GeForce GTX 1070 DirectX 12 GeForce GTX 10... $3.99 Shipping
4 XFX XFX Radeon RX 550 DirectX 12 RX-550P4TFG5 Vide... $3.99 Shipping

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: