For Loop


In [ ]:
for i in range(1, 1000):
    pass

In [ ]:
for number in number_list:
    pass
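
A minimal concrete example of the loop templates above (number_list here is just sample data):


In [ ]:
# sum a small sample list with a for loop
number_list = [2, 4, 6, 8]
total = 0
for number in number_list:
    total += number
print(total)    # prints 20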

Try Except


In [ ]:
try:
    pass

except IndexError:
    pass

except AttributeError:
    pass

except KeyError:  
    pass

except:
    # bare except: catches anything not handled above
    pass

In [ ]:
try:
    # write your code here
    pass

except IndexError:
    # write the code that handles this error here,
    # or just let it print the message on the next line
    print('[ERROR] IndexError')

# AttributeError often happens when you access an attribute on None,
# e.g. "'NoneType' object has no attribute 'path'" (like a NullPointerException in Java)
except AttributeError:
    # write the code that handles this error here,
    # or just let it print the message on the next line
    print('[ERROR] AttributeError')

# KeyError is raised whenever a dict is indexed (a = adict[key])
# and the key is not in the dictionary.
except KeyError:
    print('[ERROR] KeyError')
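
A small self-contained example that triggers each of the three errors described above, using made-up sample data:


In [ ]:
# illustration only: each lambda deliberately raises one of the errors handled above
triggers = [
    lambda: [1, 2, 3][10],          # IndexError: list index out of range
    lambda: None.path,              # AttributeError: 'NoneType' object has no attribute 'path'
    lambda: {'a': 1}['missing'],    # KeyError: 'missing' is not in the dict
]

for trigger in triggers:
    try:
        trigger()
    except IndexError:
        print('[ERROR] IndexError')
    except AttributeError:
        print('[ERROR] AttributeError')
    except KeyError:
        print('[ERROR] KeyError')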

While Loop


In [ ]:
while True:
    try:
        pass

    except:
        break
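
For example, this pattern can walk through data of unknown length and let the exception end the loop (the list below is just sample data):


In [ ]:
# keep reading items until the index runs past the end of the list
items = ['a', 'b', 'c']
i = 0
while True:
    try:
        print(items[i])
        i += 1
    except IndexError:
        break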

In [ ]:
while a != 0:
    try:
        pass

    except:
        pass

Crawler Architecture

The basic imports you should have


In [ ]:
import requests as rq
from bs4 import BeautifulSoup as bs
from collections import OrderedDict as od
from selenium import webdriver
from datetime import datetime
import time
import random
import json
import csv
import traceback as tb
import re

Use the following only as a reference architecture.

Top-level crawler


In [ ]:
def famicloud_crawler():
    # index_url must be defined before this function is called
    res_index = rq.get(index_url)
    soup_index = bs(res_index.text, 'lxml')
    soup_li = soup_index.li


    for i in range(1, 1000):
        # hand off to the mid-level crawler
        #small_branch_crawler(small_branch_url)
        pass
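
A rough sketch of how the hand-off loop above might be filled in. The famicloud_crawler_sketch name, the <li>/<a> selectors, and the polite random delay are assumptions for illustration, not the real page structure:


In [ ]:
# sketch only: index_url, the selectors, and the delay are assumed for illustration
def famicloud_crawler_sketch(index_url):
    res_index = rq.get(index_url)
    soup_index = bs(res_index.text, 'lxml')

    # assume each <li> on the index page wraps a link to a branch listing page
    for li in soup_index.select('li'):
        a_tag = li.find('a')
        if a_tag is None or not a_tag.get('href'):
            continue
        small_branch_url = a_tag['href']
        small_branch_crawler(small_branch_url)   # hand off to the mid-level crawler
        time.sleep(random.uniform(1, 3))         # be polite to the server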

Mid-level crawler (this example uses Selenium's webdriver because the page needs rendering, so it is slow)


In [ ]:
def small_branch_crawler(small_branch_url):
    driver = webdriver.PhantomJS(executable_path='C:/Users/nick800608/phantomjs-2.1.1-windows/bin/phantomjs')
    driver.get(small_branch_url)
    page_source = driver.page_source
    soup = bs(page_source, 'lxml')

    # a_href_list: the product links extracted from soup (build it before this loop)
    for a_href in a_href_list:
        # hand off to the bottom-level crawler
        #product_page_crawler(product_url)
        pass
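
A sketch of how a_href_list could be built from the rendered page before handing each link to the bottom-level crawler; picking every <a href=...> on the page is an assumption and should be narrowed for the real site:


In [ ]:
# sketch only: the link selection below is an assumption for illustration
def small_branch_crawler_sketch(small_branch_url):
    driver = webdriver.PhantomJS(executable_path='C:/Users/nick800608/phantomjs-2.1.1-windows/bin/phantomjs')
    driver.get(small_branch_url)
    soup = bs(driver.page_source, 'lxml')
    driver.quit()

    a_href_list = [a['href'] for a in soup.find_all('a', href=True)]
    for product_url in a_href_list:
        product_page_crawler(product_url)        # hand off to the bottom-level crawler
        time.sleep(random.uniform(1, 3))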

Bottom-level crawler


In [ ]:
def product_page_crawler(product_url):
    res = rq.get(product_url)
    soup = bs(res.text, 'lxml')
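
A sketch of the fields the bottom-level crawler might extract and save; the <h1> tag, the .price selector, and the products.json output file are all assumptions for illustration:


In [ ]:
# sketch only: the tag/selector names and the output file are assumed field locations
def product_page_crawler_sketch(product_url):
    res = rq.get(product_url)
    soup = bs(res.text, 'lxml')

    record = od()
    record['url'] = product_url
    record['crawled_at'] = datetime.now().isoformat()

    name_tag = soup.find('h1')
    record['name'] = name_tag.get_text(strip=True) if name_tag else None

    price_tag = soup.select_one('.price')
    record['price'] = price_tag.get_text(strip=True) if price_tag else None

    # append one JSON line per product
    with open('products.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')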