For Loop


In [ ]:
for i in range(1, 1000):
    pass

In [ ]:
for number in number_list:
    pass
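
A minimal concrete example of the loop templates above (number_list here is just sample data):


In [ ]:
# sum a small sample list with a for loop
number_list = [2, 4, 6, 8]
total = 0
for number in number_list:
    total += number
print(total)    # prints 20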

Try Except


In [ ]:
try:
    pass

except IndexError:
    pass

except AttributeError:
    pass

except KeyError:  
    pass

except:
    # bare except: catches anything not handled above
    pass

In [ ]:
try:
    # write your code here
    pass

except IndexError:
    # write the code that handles this error here,
    # or just let it print the message on the next line
    print('[ERROR] IndexError')

# AttributeError often happens when you access an attribute on None,
# e.g. "'NoneType' object has no attribute 'path'" (like a NullPointerException in Java)
except AttributeError:
    # write the code that handles this error here,
    # or just let it print the message on the next line
    print('[ERROR] AttributeError')

# KeyError is raised whenever a dict is indexed (a = adict[key])
# and the key is not in the dictionary.
except KeyError:
    print('[ERROR] KeyError')
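
A small self-contained example that triggers each of the three errors described above, using made-up sample data:


In [ ]:
# illustration only: each lambda deliberately raises one of the errors handled above
triggers = [
    lambda: [1, 2, 3][10],          # IndexError: list index out of range
    lambda: None.path,              # AttributeError: 'NoneType' object has no attribute 'path'
    lambda: {'a': 1}['missing'],    # KeyError: 'missing' is not in the dict
]

for trigger in triggers:
    try:
        trigger()
    except IndexError:
        print('[ERROR] IndexError')
    except AttributeError:
        print('[ERROR] AttributeError')
    except KeyError:
        print('[ERROR] KeyError')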

While Loop


In [ ]:
while True:
    try:
        pass

    except:
        break
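
For example, this pattern can walk through data of unknown length and let the exception end the loop (the list below is just sample data):


In [ ]:
# keep reading items until the index runs past the end of the list
items = ['a', 'b', 'c']
i = 0
while True:
    try:
        print(items[i])
        i += 1
    except IndexError:
        break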

In [ ]:
while a != 0:
    try:
        pass

    except:
        pass

Crawler Architecture

The basic imports you should have


In [ ]:
import requests as rq
from bs4 import BeautifulSoup as bs
from collections import OrderedDict as od
from selenium import webdriver
from datetime import datetime
import time
import random
import json
import csv
import traceback as tb
import re

Use the following only as a reference architecture.

Top-level crawler


In [ ]:
def famicloud_crawler():
    # index_url must be defined before this function is called
    res_index = rq.get(index_url)
    soup_index = bs(res_index.text, 'lxml')
    soup_li = soup_index.li


    for i in range(1, 1000):
        # hand off to the mid-level crawler
        #small_branch_crawler(small_branch_url)
        pass
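
A rough sketch of how the hand-off loop above might be filled in. The famicloud_crawler_sketch name, the <li>/<a> selectors, and the polite random delay are assumptions for illustration, not the real page structure:


In [ ]:
# sketch only: index_url, the selectors, and the delay are assumed for illustration
def famicloud_crawler_sketch(index_url):
    res_index = rq.get(index_url)
    soup_index = bs(res_index.text, 'lxml')

    # assume each <li> on the index page wraps a link to a branch listing page
    for li in soup_index.select('li'):
        a_tag = li.find('a')
        if a_tag is None or not a_tag.get('href'):
            continue
        small_branch_url = a_tag['href']
        small_branch_crawler(small_branch_url)   # hand off to the mid-level crawler
        time.sleep(random.uniform(1, 3))         # be polite to the server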

Mid-level crawler (this example uses Selenium's webdriver because the page needs rendering, so it is slow)


In [ ]:
def small_branch_crawler(small_branch_url):
    driver = webdriver.PhantomJS(executable_path='C:/Users/nick800608/phantomjs-2.1.1-windows/bin/phantomjs')
    driver.get(small_branch_url)
    page_source = driver.page_source
    soup = bs(page_source, 'lxml')

    # a_href_list: the product links extracted from soup (build it before this loop)
    for a_href in a_href_list:
        # hand off to the bottom-level crawler
        #product_page_crawler(product_url)
        pass
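
A sketch of how a_href_list could be built from the rendered page before handing each link to the bottom-level crawler; picking every <a href=...> on the page is an assumption and should be narrowed for the real site:


In [ ]:
# sketch only: the link selection below is an assumption for illustration
def small_branch_crawler_sketch(small_branch_url):
    driver = webdriver.PhantomJS(executable_path='C:/Users/nick800608/phantomjs-2.1.1-windows/bin/phantomjs')
    driver.get(small_branch_url)
    soup = bs(driver.page_source, 'lxml')
    driver.quit()

    a_href_list = [a['href'] for a in soup.find_all('a', href=True)]
    for product_url in a_href_list:
        product_page_crawler(product_url)        # hand off to the bottom-level crawler
        time.sleep(random.uniform(1, 3))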

Bottom-level crawler


In [ ]:
def product_page_crawler(product_url):
    res = rq.get(product_url)
    soup = bs(res.text, 'lxml')
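
A sketch of the fields the bottom-level crawler might extract and save; the <h1> tag, the .price selector, and the products.json output file are all assumptions for illustration:


In [ ]:
# sketch only: the tag/selector names and the output file are assumed field locations
def product_page_crawler_sketch(product_url):
    res = rq.get(product_url)
    soup = bs(res.text, 'lxml')

    record = od()
    record['url'] = product_url
    record['crawled_at'] = datetime.now().isoformat()

    name_tag = soup.find('h1')
    record['name'] = name_tag.get_text(strip=True) if name_tag else None

    price_tag = soup.select_one('.price')
    record['price'] = price_tag.get_text(strip=True) if price_tag else None

    # append one JSON line per product
    with open('products.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')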