In [ ]:
import requests
from bs4 import BeautifulSoup
import re
In [ ]:
with open('url.csv', 'r') as f:
    url_list = f.read().split('\n')
In [ ]:
len(url_list)
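Note that split('\n') keeps a trailing empty string when the file ends with a newline, so the count above can be off by one. A minimal sketch of a cleaned copy (clean_urls is an illustrative name, not reused below):
In [ ]:
# cleaned copy: drop blank lines and stray whitespace without touching url_list
with open('url.csv', 'r') as f:
    clean_urls = [line.strip() for line in f if line.strip()]
len(clean_urls)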
In [ ]:
print(url_list[83])
In [ ]:
bili_url = url_list[83]
%time res = requests.get(bili_url)
print('status:', res.status_code)
print(res.text[:600])
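Beyond the body, the Response object carries useful metadata. A quick sketch of the fields worth checking on any fetch (all standard requests attributes):
In [ ]:
print('ok:      ', res.ok)                            # True for status codes < 400
print('type:    ', res.headers.get('Content-Type'))   # server-declared content type
print('encoding:', res.encoding)                      # encoding requests will decode with
print('elapsed: ', res.elapsed.total_seconds(), 's')  # time until headers arrived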
In [ ]:
print(url_list[80])
In [ ]:
bad_url = url_list[80]
try:
    # timeout stops the request from hanging forever on an unreachable host
    %time res = requests.get(bad_url, timeout=10)
    print('status:', res.status_code)
    print(res.text[:600])
except Exception:
    print('cannot open:', bad_url)
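A timeout plus try/except only fails fast; if transient errors are common, requests can also retry automatically through a Session. A sketch using urllib3's Retry (the retry parameters here are illustrative, not tuned):
In [ ]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# session that retries up to 3 times with backoff on connection/server errors
session = requests.Session()
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))
try:
    res = session.get(bad_url, timeout=10)
    print('status:', res.status_code)
except Exception:
    print('still cannot open:', bad_url)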
In [ ]:
print(url_list[45])
In [ ]:
zhihu_url = url_list[45]
header = {'User-Agent':
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
try:
    # headers deliberately omitted here: zhihu refuses requests without a browser User-Agent
    %time res = requests.get(zhihu_url, timeout=10)
    print('status:', res.status_code)
    print(res.text[:600])
except Exception:
    print('cannot open:', zhihu_url)
In [ ]:
zhihu_url = url_list[45]
header = {'User-Agent':
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
try:
    # same request, now with the browser User-Agent, so the server answers normally
    %time res = requests.get(zhihu_url, timeout=10, headers=header)
    print('status:', res.status_code)
    print(res.text[:600])
except Exception:
    print('cannot open:', zhihu_url)
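Passing headers=header on every call works but repeats itself; a Session can carry the User-Agent (and cookies) across all its requests. A minimal sketch:
In [ ]:
# a Session applies these default headers to every request it makes
s = requests.Session()
s.headers.update(header)
%time res = s.get(zhihu_url, timeout=10)
print('status:', res.status_code)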
In [ ]:
print(url_list[0])
In [ ]:
luanma_url = url_list[0]
try:
    # without an explicit encoding, the Chinese text decodes as mojibake
    %time res = requests.get(luanma_url, timeout=10, headers=header)
    print('status:', res.status_code)
    print(res.text[:600])
except Exception:
    print('cannot open:', luanma_url)
In [ ]:
luanma_url = url_list[0]
try:
    %time res = requests.get(luanma_url, timeout=10, headers=header)
    # read the charset declared in the HTML and tell requests to decode with it
    res.encoding = re.findall(r'charset="*([a-zA-Z0-9-]+)', res.text)[0]
    print('status:', res.status_code)
    print(res.text[:600])
except Exception:
    print('cannot open:', luanma_url)
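The regex above relies on the page declaring its charset in a meta tag. When it does not, requests can guess from the raw bytes instead via apparent_encoding; a sketch of that fallback:
In [ ]:
# let requests sniff the encoding from the bytes instead of the HTML header
res = requests.get(luanma_url, timeout=10, headers=header)
res.encoding = res.apparent_encoding
print(res.text[:600])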
In [ ]:
print(url_list[67])
In [ ]:
crawl_url = url_list[67]
try:
    %time res = requests.get(crawl_url, timeout=10, headers=header)
    res.encoding = re.findall(r'charset="*([a-zA-Z0-9-]+)', res.text)[0]
    print('status:', res.status_code)
    print(res.text[:600])
    html = res.text  # keep the decoded page for parsing below
except Exception:
    print('cannot open:', crawl_url)
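The same fetch pattern (timeout, browser header, charset fix, error handling) has now appeared several times; a helper consolidating it, returning None on failure (fetch_html is an illustrative name, a sketch rather than part of the original flow):
In [ ]:
def fetch_html(url, timeout=10):
    """Fetch a page with the browser header, fix the charset, return HTML or None."""
    try:
        res = requests.get(url, timeout=timeout, headers=header)
        charsets = re.findall(r'charset="*([a-zA-Z0-9-]+)', res.text)
        if charsets:
            res.encoding = charsets[0]
        return res.text
    except Exception:
        print('cannot open:', url)
        return None

html = fetch_html(crawl_url)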
In [ ]:
bs = BeautifulSoup(html, 'lxml')
In [ ]:
bs.title.text
In [ ]:
bs.find("meta", {"name":"description"})['content']
In [ ]:
bs.find('h1').text
In [ ]:
# src of the second-to-last .jpg on the page (raw string avoids the invalid \. escape)
img_url = bs.find_all(src=re.compile(r'.+\.jpg'))[-2]['src']
print(img_url)
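The same extractions can be written with CSS selectors, which also fail more gracefully when a tag is missing (select_one returns None instead of raising). A sketch of the equivalents:
In [ ]:
# CSS-selector equivalents of the find/find_all calls above
print(bs.select_one('title').text)
meta = bs.select_one('meta[name=description]')
print(meta['content'] if meta else 'no description tag')
print(bs.select_one('h1').text)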
In [ ]:
from IPython.display import Image
from IPython.core.display import HTML
Image(url="http:" + img_url)  # img_url is protocol-relative (//...), so prepend a scheme
In [ ]:
# submission payload: keys url1..url100 (values here are just the index)
para = {'url' + str(i): i for i in range(1, 101)}
dict(list(para.items())[0:5])  # preview the first five entries
In [ ]:
para['token'] = 'iOkjn2dsAl7js4iD'
In [ ]:
submit_url = "http://hackathon.mzsvn.com/submit.php"
sub_res = requests.post(submit_url, data=para)
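Before reading the body it is worth confirming the POST itself succeeded; raise_for_status() turns 4xx/5xx answers into exceptions (a standard requests call):
In [ ]:
# fail loudly if the server rejected the submission
print('status:', sub_res.status_code)
sub_res.raise_for_status()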
In [ ]:
sub_res.text
Author: William