In [1]:

    
# ! /usr/bin/env python
# -*- coding=utf-8 -*-

import urllib
import urllib2
from urllib3.contrib import pyopenssl
import requests
import re,random
from multiprocessing.dummy import Pool as ThreadPool 
import re
#import urllib.request
#import urllib.error
import gzip
from bs4 import BeautifulSoup, element
#import mysql.connector
from collections import namedtuple
import os
import platform
import logging
import subprocess
import time
from datetime import datetime

time_out = 10 # 全局变量 10 秒超时时间
count = 0
proxies = [None]

Proxy = namedtuple("Proxy", ["ip", "port", "addtime", "wspeed", "lspeed", "country", "anonymity", "linuxping", "winping"])

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
             'Accept':'text/html;q=0.9,*/*;q=0.8',
             'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
             #'Accept-Encoding':'gzip',
             'Connection':'close',
             'Referer':None #注意如果依然不能抓取的话，这里可以设置抓取网站的host
             }



def get_proxy():
    # 使用全局变量,修改之
    global proxies
    '''
    try:
        url = 'http://www.xicidaili.com/'
        #url = 'http://www.gatherproxy.com/proxylist/anonymity/?t=Elite'
        #url = 'https://free-proxy-list.net/anonymous-proxy.html'
        req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
             'Accept':'text/html;q=0.9,*/*;q=0.8',
             'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
             #'Accept-Encoding':'gzip',
             'Connection':'close',
             'Referer':None #注意如果依然不能抓取的话，这里可以设置抓取网站的host
             }
        req_timeout = 10
        req = urllib2.Request(url,None,req_header)
        resp = urllib2.urlopen(req,None,req_timeout)
        html = resp.read()
     
        #req.add_header("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 
        #  (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")

        response = urllib2.urlopen(req)
        the_page = response.read()
        #req = urllib2.urlopen('http://www.xicidaili.com/',None,headers)
        #req = urllib2.Request("http://www.xicidaili.com/", headers=headers)
        #req  = urllib2.urlopen('http://www.xicidaili.com/')
        #print("===== start to get proxy list",req.read())
        
    except urllib2.URLError,e:
        print('无法获取代理信息!')
        print e.reason
        print e.reason[0]
        print e.reason[1]
        return
    
    
    response =urllib2.urlopen(req)
    print ("===== content-type : ",response.headers['content-type'] )
    print ("===== headers.getparam('charset') : ",response.headers.getparam('charset') )
    
    result = response.read().decode('utf-8')
    #html = response.read().decode('utf-8')
   
    #print("===== get proxy page ,type : ",result,type(result) 
    '''
    url ='https://free-proxy-list.net/anonymous-proxy.html'
    
    req = requests.get(url,headers=headers)

    #req=requests.post(url,headers=headers)

    #req.prepare().body
    #req.prepare().headers
    #url_result = requests.get(url)
    #print("===== req:,type ",req.text,type(req))
    #soup = BeautifulSoup(requests.get(url).content)
    soup = BeautifulSoup(req.text, "html.parser")
    proxylist = []
    print("begin getProxy")
    try:
        if url == "https://free-proxy-list.net/anonymous-proxy.html":
            tbody = soup.tbody
            #print("===== tbody:,type ",tbody,type(tbody))
            for child in tbody.children:
                if isinstance(child, element.Tag) and len(child) > 1:
                    # ip
                    tp = child.contents[0]
                    ip = str(tp.contents[0].string)
                    # port
                    try:
                        tp = child.contents[1]
                        port = str(tp.contents[0].string)
                    except IndexError as e:
                        logging.exception(e)
                        port = str(tp.string).strip()
                        print(port)
                    # time
                    addtime = datetime.utcnow()
                    # speed
                    wspeed = 9999
                    lspeed = 9999
                    # country
                    tp = child.contents[3]
                    country = str(tp.contents[0].strip())
                    # anonymity
                    tp = child.contents[4]
                    anonymity = str(tp.contents[0].string).lower()
                    winping = 9999
                    linuxping = 9999
                    proxy_item = Proxy(ip, port, addtime, wspeed, lspeed, country, anonymity, linuxping, winping)
                    proxylist.append(proxy_item)
                    #print("===== proxylist:,type ",proxylist,type(proxylist))
                    logging.info(proxy_item)
        else:
            pass
        print("finish getProxy")
    except Exception as e:
        logging.exception(e)
    
    #p = re.compile(r'''<tr\sclass[^>]*>\s+<td\s.*</td>\s+<td>(.*)?</td>\s+<td>(.*)?</td>\s+<td>.*</td>\s+<td.*</td>\s+<td>(.*)?</td>\s+<td>.*</td>\s+<td>.*</td>\s+</tr>''',re.VERBOSE)
    #proxy_list = p.findall(result)
    
    #proxy_list = re.findall(r'<tr class="odd"><</tr>',result, re.M|re.I|re.S)
    
    
    #print("===== get proxy page proxy_list:,type ",proxy_list,type(proxy_list))
    proxies.append('221.211.193.51:80')
    proxies.append('120.7.84.59:8118')
    proxies.append('203.91.121.76:3128')
    proxies.append('180.168.179.193:8080')
    for each_proxy in proxylist[1:]:
        proxies.append(each_proxy[0]+':'+each_proxy[1])
    print("===== proxies:,type ",proxies,type(proxies))  
    
def change_proxy():
    # 随机从序列中取出一个元素
    proxy = random.choice(proxies)
    # 判断元素是否合理
    if proxy == None:
        proxy_support =urllib2.ProxyHandler({})
    else:
        proxy_support = urllib2.ProxyHandler({'http':proxy})
    opener = urllib2.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',headers['User-Agent'])]
    urllib2.install_opener(opener)
    print('智能切换代理：%s' % ('本机' if proxy==None else proxy))
def get_req(url):
    # 先伪造一下头部吧,使用字典
    blog_eader = {
                'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36',
                'Host':'blog.csdn.net',
                'Referer':'http://blog.csdn.net/',
                'GET':url
                } 
    #req = urllib2.urlopen(url,headers = blog_eader)
    req = urllib2.Request(url, headers=headers)
    return req
# 访问 博客
def look_blog(url):
    # 切换一下IP
    change_proxy()
    req = get_req(url)
    try:
        look_response = urllib2.urlopen(req,timeout = time_out)
    except:
        return
    else:
        print('Access Successfully !')
        #print ('Look result :',look_response.read())
        #return look_response 
# 迭代访问
def click_blog(url):
    for i in range(0,count):
        if(i == count):
            break
        print('当前访问 Blog %s 第 %d 次' % (url,i))
        look_blog(url)

if __name__ == '__main__':
    global count
    # 基本参数初始化
    # 获取代理
    print("===initialization")
    get_proxy()
    print('有效代理个数为 : %d' % len(proxies))
    #blogurl = input('输入blog链接:')
    # 这个地方原本是我的默认输入偷懒用的
    blogurl = 'https://youtu.be/RysyB_Zcjlo'
    #blogurl = 'http://icanhazip.com/'
    if len(blogurl) == 0:
        #blogurl = 'http://blog.csdn.net/bkxiaoc/'
        blogurl = 'https://youtu.be/RysyB_Zcjlo'
        
    print('URL is :' ,blogurl)
    try:
        count = int(input('输入次数:'))
    except ValueError:
        print('参数错误')
        quit() 
    if count == 0 or count > 999:
        print('次数过大或过小')
        quit()
    print('次数确认为 %d' % count)
    # 获取 博文 列表,由于测试时我的博文只有一页所以 只能获得一页的列表
    #blog_list = get_blog_list("https://youtu.be/RysyB_Zcjlo")
    #print ("==== blog_list , type ",blog_list, type(blog_list))
    #if len(blog_list) == 0:
    #    print('未找到Blog列表')
    #    quit()
    print('启动!!!!!!!!!!!!!!!!!!!!')
    # 迭代一下 使用多线程
    
    index = 0
    url_list = []
    #maxConnection = 10 
    for i in range(10):
        # 补全头部
        each_link = blogurl + '?' + str(i)
        print ("===== each_link, type",each_link,type(each_link))
        print ("===== index, type",index,type(index))
        
        url_list.append(each_link)
        index += 1
    # 有多少个帖子就开多少个线程的一半 let's go
    pool = ThreadPool(15)
    batch_result = pool.map(click_blog, url_list)
    print ("===== batch_result, type",batch_result,type(batch_result))
    pool.close()
    pool.join()
    print('完成任务!!!!!!!!!!!!!!!!!!!!')









    



===initialization
begin getProxy
finish getProxy
('===== proxies:,type ', [None, '221.211.193.51:80', '120.7.84.59:8118', '203.91.121.76:3128', '180.168.179.193:8080', '61.6.64.73:53281', '178.150.144.60:53281', '119.42.78.61:62225', '194.186.213.38:53281', '46.101.72.238:8118', '61.6.85.186:53281', '180.251.65.46:62225', '61.6.158.52:53281', '62.218.42.146:3128', '14.136.195.130:8888', '47.91.235.15:80', '50.232.30.218:3128', '167.114.47.231:3128', '35.159.6.72:3128', '118.97.129.219:8080', '46.218.35.59:3129', '208.83.106.105:9999', '46.36.65.10:3128', '142.75.2.14:3128', '171.101.236.116:3128', '182.253.139.36:65301', '181.214.224.96:3128', '175.209.34.197:8080', '80.83.20.14:80', '149.56.64.58:80', '202.69.60.228:8080', '203.190.14.105:8080', '203.58.117.34:80', '45.76.90.81:3128', '106.104.83.69:9999', '203.142.34.36:62225', '206.127.141.67:80', '124.41.213.92:62225', '94.177.175.232:3128', '45.77.141.92:3128', '165.227.144.174:80', '125.31.19.26:80', '179.241.166.41:8080', '45.32.19.109:3128', '200.229.193.106:8080', '165.84.167.54:8080', '195.154.77.130:3128', '52.78.122.80:80', '217.24.160.10:3128', '47.52.102.226:9999', '61.6.41.233:62225', '35.199.48.210:80', '195.154.42.249:3128', '61.7.181.240:62225', '35.199.54.38:80', '203.104.192.230:3128', '78.46.200.194:80', '5.13.218.242:80', '173.161.162.68:8118', '203.189.141.162:63909', '213.136.77.246:80', '212.83.164.85:80', '45.77.141.222:3128', '213.136.89.121:80', '203.146.82.253:3128', '125.24.120.184:8080', '36.67.114.226:65309', '218.50.2.102:8080', '45.79.0.108:1080', '45.76.88.208:3128', '47.89.241.103:3128', '125.24.120.172:8080', '35.199.4.125:80', '187.49.206.162:80', '216.56.48.118:9000', '94.250.249.190:80', '221.133.44.142:8080', '41.159.141.179:8080', '110.78.148.201:62225', '186.46.153.174:62225', '209.159.156.199:80', '103.43.203.209:65301', '52.213.142.194:80', '81.177.101.171:80', '114.134.190.28:62225', '45.77.119.206:8118', '171.100.221.4:8080', '14.142.167.178:3128', '59.152.7.61:53281', '177.128.159.70:62225', '91.197.132.38:53281', '78.26.207.173:53281', '167.249.68.66:8080', '212.110.20.141:88', '195.209.125.98:53281', '186.133.13.202:21320', '182.162.143.90:80', '138.197.192.64:65000', '108.165.2.110:80', '186.250.232.109:53281', '154.72.74.82:53281', '177.128.154.128:62225', '45.76.53.88:80', '196.223.140.170:63909'], <type 'list'>)
有效代理个数为 : 104
('URL is :', 'https://youtu.be/RysyB_Zcjlo')
输入次数:2
次数确认为 2
启动!!!!!!!!!!!!!!!!!!!!
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?0', <type 'str'>)
('===== index, type', 0, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?1', <type 'str'>)
('===== index, type', 1, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?2', <type 'str'>)
('===== index, type', 2, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?3', <type 'str'>)
('===== index, type', 3, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?4', <type 'str'>)
('===== index, type', 4, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?5', <type 'str'>)
('===== index, type', 5, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?6', <type 'str'>)
('===== index, type', 6, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?7', <type 'str'>)
('===== index, type', 7, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?8', <type 'str'>)
('===== index, type', 8, <type 'int'>)
('===== each_link, type', 'https://youtu.be/RysyB_Zcjlo?9', <type 'str'>)
('===== index, type', 9, <type 'int'>)
当前访问 Blog https://youtu.be/RysyB_Zcjlo?0 第 0 次
智能切换代理：120.7.84.59:8118当前访问 Blog https://youtu.be/RysyB_Zcjlo?1 第 0 次 当前访问 Blog https://youtu.be/RysyB_Zcjlo?3 第 0 次 当前访问 Blog https://youtu.be/RysyB_Zcjlo?5 第 0 次当前访问 Blog https://youtu.be/RysyB_Zcjlo?6 第 0 次当前访问 Blog https://youtu.be/RysyB_Zcjlo?7 第 0 次

当前访问 Blog https://youtu.be/RysyB_Zcjlo?2 第 0 次
当前访问 Blog https://youtu.be/RysyB_Zcjlo?4 第 0 次


当前访问 Blog https://youtu.be/RysyB_Zcjlo?9 第 0 次当前访问 Blog https://youtu.be/RysyB_Zcjlo?8 第 0 次
智能切换代理：186.250.232.109:53281
智能切换代理：200.229.193.106:8080
智能切换代理：175.209.34.197:8080智能切换代理：186.133.13.202:21320智能切换代理：45.77.119.206:8118
智能切换代理：125.31.19.26:80
智能切换代理：221.133.44.142:8080
智能切换代理：110.78.148.201:62225


智能切换代理：61.6.85.186:53281



Access Successfully !
当前访问 Blog https://youtu.be/RysyB_Zcjlo?4 第 1 次
智能切换代理：62.218.42.146:3128
Access Successfully !
当前访问 Blog https://youtu.be/RysyB_Zcjlo?5 第 1 次
智能切换代理：114.134.190.28:62225
Access Successfully !
当前访问 Blog https://youtu.be/RysyB_Zcjlo?9 第 1 次
Access Successfully ! Access Successfully !智能切换代理：186.133.13.202:21320
Access Successfully !
 Access Successfully ! Access Successfully !
当前访问 Blog https://youtu.be/RysyB_Zcjlo?2 第 1 次
当前访问 Blog https://youtu.be/RysyB_Zcjlo?8 第 1 次Access Successfully !
Access Successfully !
当前访问 Blog https://youtu.be/RysyB_Zcjlo?3 第 1 次
当前访问 Blog https://youtu.be/RysyB_Zcjlo?1 第 1 次

当前访问 Blog https://youtu.be/RysyB_Zcjlo?0 第 1 次

智能切换代理：173.161.162.68:8118
智能切换代理：61.6.41.233:62225当前访问 Blog https://youtu.be/RysyB_Zcjlo?7 第 1 次
当前访问 Blog https://youtu.be/RysyB_Zcjlo?6 第 1 次智能切换代理：52.78.122.80:80
智能切换代理：208.83.106.105:9999

智能切换代理：218.50.2.102:8080

智能切换代理：203.58.117.34:80

智能切换代理：182.162.143.90:80

Access Successfully !
Access Successfully !
Access Successfully !
Access Successfully !
Access Successfully !
Access Successfully !
Access Successfully !
Access Successfully !
Access Successfully !
Access Successfully !
('===== batch_result, type', [None, None, None, None, None, None, None, None, None, None], <type 'list'>)
完成任务!!!!!!!!!!!!!!!!!!!!