Required Python packages:

  • requests
  • bs4 (published on PyPI as beautifulsoup4)
  • lxml (used later as the BeautifulSoup parser)
  • re (standard library)
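
If any of these are missing, a notebook cell along these lines should cover everything used below:

In [ ]:
!pip install requests beautifulsoup4 lxml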

In [ ]:
import requests
from bs4 import BeautifulSoup
import re

Download the url dataset from Miaozhen (秒针):

http://hackathon.mzsvn.com/download.php

In [ ]:
download_url = "http://hackathon.mzsvn.com/download.php"
url_list = requests.get(download_url).text.strip().split('\n')

Or read the list from the local copy, url.csv:

In [ ]:
url_list = open('url.csv','r').read().split('\n')

In [ ]:
len(url_list)

A perfectly normal website


In [ ]:
print (url_list[83])

In [ ]:
bili_url = url_list[83]
%time res = requests.get(bili_url)
print ('status: ',res.status_code)
print (res.text[:600])

Links that cannot be opened

Set a request timeout and handle the exception


In [ ]:
print (url_list[80])

In [ ]:
bad_url = url_list[80]
try:
    %time res = requests.get(bad_url,timeout=10)
    print ('status: ',res.status_code)
    print (res.text[:600])
except:
    print ('cannot open:',bad_url)
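
The bare except above hides the reason for the failure; a variant that separates timeouts from other connection problems, using the exception classes requests provides, could look like this:

In [ ]:
try:
    res = requests.get(bad_url, timeout=10)
    print('status: ', res.status_code)
except requests.exceptions.Timeout:
    # the server did not answer within 10 seconds
    print('timed out:', bad_url)
except requests.exceptions.RequestException as err:
    # any other requests-level failure (DNS, connection refused, ...)
    print('cannot open:', bad_url, '-', type(err).__name__)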

Websites that block robots

Add a spoofed browser User-Agent header


In [ ]:
print (url_list[45])

In [ ]:
zhihu_url = url_list[45]
header = {'User-Agent': 
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
try:
    # the header is defined above but deliberately not sent yet, so the site should reject this request
    %time res = requests.get(zhihu_url,timeout=10)
    print ('status: ',res.status_code)
    print (res.text[:600])
except:
    print ('cannot open:',zhihu_url)

In [ ]:
zhihu_url = url_list[45]
header = {'User-Agent': 
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
try:
    %time res = requests.get(zhihu_url,timeout=10,headers=header)
    print ('status: ',res.status_code)
    print (res.text[:600])
except:
    print ('cannot open:',zhihu_url)
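
If the same header has to be sent to many of the urls, a requests.Session can carry it by default; a minimal sketch, reusing the header dict defined above:

In [ ]:
session = requests.Session()
session.headers.update(header)          # the spoofed User-Agent defined above
res = session.get(zhihu_url, timeout=10)
print(res.status_code)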

Websites with non-standard encodings

Extract the declared encoding with a regular expression


In [ ]:
print (url_list[0])

In [ ]:
luanma_url = url_list[0]
try:
    %time res = requests.get(luanma_url,timeout=10,headers=header)
    print ('status: ',res.status_code)
    print (res.text[:600])
except:
    print ('cannot open:',luanma_url)

In [ ]:
luanma_url = url_list[0]
try:
    %time res = requests.get(luanma_url,timeout=10,headers=header)
    res.encoding = re.findall(r'charset="*([a-zA-Z0-9-]+)',res.text)[0]
    print ('status: ',res.status_code)
    print (res.text[:600])   
except:
    print ('cannot open:',luanma_url)
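
requests can also guess the encoding from the raw response bytes, which helps when a page declares no charset at all; a sketch of that alternative:

In [ ]:
res = requests.get(luanma_url, timeout=10, headers=header)
res.encoding = res.apparent_encoding   # guessed from the response bytes
print(res.text[:600])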

Extracting key information from a page

  • title
  • description
  • main heading (h1)
  • images

and so on


In [ ]:
from bs4 import BeautifulSoup

In [ ]:
print (url_list[67])

In [ ]:
crawl_url = url_list[67]
try:
    %time res = requests.get(crawl_url,timeout=10,headers=header)
    res.encoding = re.findall(r'charset="*([a-zA-Z0-9-]+)',res.text)[0]
    print ('status: ',res.status_code)
    print (res.text[:600])
    html = res.text
except:
    print ('cannot open:',crawl_url)

In [ ]:
bs = BeautifulSoup(html,'lxml')

In [ ]:
bs.title.text

In [ ]:
bs.find("meta", {"name":"description"})['content']

In [ ]:
bs.find('h1').text

In [ ]:
img_url = bs.find_all(src=re.compile(r'.+\.jpg'))[-2]['src']
print (img_url)

In [ ]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "http:"+img_url)
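
Putting the pieces together, here is a minimal sketch of a helper that pulls these fields from one page while guarding against missing tags; it assumes the html string and imports from the cells above, and extract_page_info is a hypothetical name:

In [ ]:
def extract_page_info(html):
    """Hypothetical helper: title, description, h1 and a jpg URL from one page."""
    bs = BeautifulSoup(html, 'lxml')
    desc = bs.find("meta", {"name": "description"})
    h1 = bs.find('h1')
    imgs = bs.find_all(src=re.compile(r'.+\.jpg'))
    return {
        'title': bs.title.text if bs.title else None,
        'description': desc['content'] if desc and desc.has_attr('content') else None,
        'h1': h1.text if h1 else None,
        'img': imgs[-1]['src'] if imgs else None,   # last matching image, if any
    }

extract_page_info(html)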

How to submit

Package the scores with the url indices and POST them to the Miaozhen server


In [ ]:
# build the submission payload: one key per url ('url1' ... 'url100'); the values here are just placeholders
para = dict([('url'+str(i),i) for i in range(1,101)])

# preview the first five entries
dict(list(para.items())[0:5])

Add the team token


In [ ]:
para['token']='iOkjn2dsAl7js4iD'

Submit


In [ ]:
submit_url = "http://hackathon.mzsvn.com/submit.php"
sub_res = requests.post(submit_url,data=para)
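
A quick sanity check that the POST itself succeeded at the HTTP level, before inspecting the body:

In [ ]:
sub_res.raise_for_status()   # raises if the server answered with 4xx/5xx
print(sub_res.status_code)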

In [ ]:
sub_res.text

Author: William