In [1]:
import requests
from bs4 import BeautifulSoup
In [53]:
help(requests.get)
In [5]:
url = 'http://computational-class.github.io/bigdata/data/test.html'
content = requests.get(url)
help(content)
In [6]:
print(content.text)
In [7]:
content.encoding
Out[7]:
Beautiful Soup is a Python library designed for quick turnaround projects like screen-scraping. Beautiful Soup supports the html.parser included in Python's standard library, but it also supports a number of third-party Python parsers, such as lxml and html5lib. Depending on your setup, you might install lxml with one of these commands:
$ apt-get install python-lxml
$ easy_install lxml
$ pip install lxml
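As a quick check of the parser choice, the same snippet can be parsed with either backend; a minimal sketch, assuming lxml has been installed:
In [ ]:
# a sketch comparing parsers: 'html.parser' needs no install,
# 'lxml' is typically faster and more tolerant of malformed HTML
html = "<p class='title'><b>The Dormouse's story</b></p>"
print(BeautifulSoup(html, 'html.parser').b.text)
print(BeautifulSoup(html, 'lxml').b.text)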
In [2]:
url = 'http://computational-class.github.io/bigdata/data/test.html'
content = requests.get(url)
content = content.text
soup = BeautifulSoup(content, 'html.parser')
soup
Out[2]:
In [10]:
print(soup.prettify())
To find the CSS selector of an element such as The Dormouse's story, right-click it in the browser and choose Inspect; the browser reports the selector as body > p.title > b.
In [4]:
soup.select('body > p.title > b')[0].text
Out[4]:
In [5]:
soup.select('title')
Out[5]:
In [6]:
soup.select('a')
Out[6]:
In [7]:
soup.select('b')
Out[7]:
In [8]:
soup.select('.title')
Out[8]:
In [26]:
soup.select('.sister')
Out[26]:
In [27]:
soup.select('.story')
Out[27]:
In [9]:
soup.select('#link1')
Out[9]:
In [16]:
soup.select('#link1')[0]['href']
Out[16]:
In [10]:
soup.select('p #link1')
Out[10]:
In [17]:
soup.select("head > title")
Out[17]:
In [72]:
soup.select("body > p")
Out[72]:
In [30]:
soup('p')
Out[30]:
In [31]:
soup.find_all('p')
Out[31]:
In [32]:
[i.text for i in soup('p')]
Out[32]:
In [34]:
for i in soup('p'):
    print(i.text)
In [35]:
for tag in soup.find_all(True):
    print(tag.name)
In [36]:
soup('head') # or soup.head
Out[36]:
In [37]:
soup('body') # or soup.body
Out[37]:
In [38]:
soup('title') # or soup.title
Out[38]:
In [39]:
soup('p')
Out[39]:
In [40]:
soup.p
Out[40]:
In [41]:
soup.title.name
Out[41]:
In [42]:
soup.title.string
Out[42]:
In [43]:
soup.title.text
# .text is the recommended way to get a tag's text
Out[43]:
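Why is .text recommended? .string only returns a value when a tag has exactly one child string, and is None otherwise, while .text concatenates all descendant strings. A small sketch on the same soup:
In [ ]:
print(soup.title.string)    # the <title> tag has a single child string
print(soup.body.string)     # None: <body> has several children
print(soup.body.text[:30])  # .text concatenates all descendant strings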
In [44]:
soup.title.parent.name
Out[44]:
In [45]:
soup.p
Out[45]:
In [46]:
soup.p['class']
Out[46]:
In [47]:
soup.find_all('p', {'class': 'title'})
Out[47]:
In [19]:
soup.find_all('p', class_='title')
Out[19]:
In [49]:
soup.find_all('p', {'class': 'story'})
Out[49]:
In [34]:
soup.find_all('p', {'class': 'story'})[0].find_all('a')
Out[34]:
In [51]:
soup.a
Out[51]:
In [52]:
soup('a')
Out[52]:
In [53]:
soup.find(id="link3")
Out[53]:
In [54]:
soup.find_all('a')
Out[54]:
In [55]:
soup.find_all('a', {'class': 'sister'}) # compare with soup.find_all('a')
Out[55]:
In [56]:
soup.find_all('a', {'class': 'sister'})[0]
Out[56]:
In [57]:
soup.find_all('a', {'class': 'sister'})[0].text
Out[57]:
In [58]:
soup.find_all('a', {'class': 'sister'})[0]['href']
Out[58]:
In [59]:
soup.find_all('a', {'class': 'sister'})[0]['id']
Out[59]:
In [71]:
soup.find_all(["a", "b"])
Out[71]:
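The cells above pull one attribute at a time from one link; the same lookups can be combined into a single pass over all the sister links, a sketch reusing the soup built earlier:
In [ ]:
for a in soup.find_all('a', {'class': 'sister'}):
    # text plus the href and id attributes read individually above
    print(a.text, a['href'], a['id'])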
In [38]:
print(soup.get_text())
In [11]:
from IPython.display import display_html, HTML
HTML(url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ5MTE3OA==&mid=206241627&idx=1&sn=471e59c6cf7c8dae452245dbea22c8f3&3rd=MzA3MDU4NTYzMw==&scene=6#rd')
# the webpage we would like to crawl
Out[11]:
In [12]:
url = "http://mp.weixin.qq.com/s?__biz=MzA3MjQ5MTE3OA==&mid=206241627&idx=1&sn=471e59c6cf7c8dae452245dbea22c8f3&3rd=MzA3MDU4NTYzMw==&scene=6#rd"
content = requests.get(url).text # fetch the page's HTML text
soup = BeautifulSoup(content, 'html.parser')
In [17]:
title = soup.select("#activity-name") # CSS id selector found via Inspect
title[0].text.strip()
Out[17]:
In [18]:
soup.find('h2', {'class': 'rich_media_title'}).text.strip()
Out[18]:
In [25]:
print(soup.find('div', {'class': 'rich_media_meta_list'}))
In [26]:
soup.select('#publish_time')
Out[26]:
In [27]:
article = soup.find('div', {'class': 'rich_media_content'}).text
print(article)
In [30]:
rmml = soup.find('div', {'class': 'rich_media_meta_list'})
#date = rmml.find(id = 'post-date').text
rmc = soup.find('div', {'class': 'rich_media_content'})
content = rmc.get_text()
print(title[0].text.strip())
#print(date)
print(content)
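The title and body extraction above can be wrapped into a small helper so other article URLs are parsed the same way; a sketch that assumes WeChat articles keep the #activity-name and rich_media_content markup used above (the function name parse_wechat_article is ours):
In [ ]:
def parse_wechat_article(url):
    # a sketch: returns (title, body text) for one WeChat article
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.select('#activity-name')[0].text.strip()
    body = soup.find('div', {'class': 'rich_media_content'}).get_text()
    return title, body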
In [15]:
!pip install wechatsogou --upgrade
In [16]:
import wechatsogou
# configurable parameters
# direct connection
ws_api = wechatsogou.WechatSogouAPI()
# number of retries after a wrong captcha (default: 1)
ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
# every requests-library keyword argument can be used here,
# e.g. proxies: include at least one working HTTPS proxy
ws_api = wechatsogou.WechatSogouAPI(proxies={
    "http": "127.0.0.1:8889",
    "https": "127.0.0.1:8889",
})
# e.g. set a timeout
ws_api = wechatsogou.WechatSogouAPI(timeout=0.1)
In [17]:
ws_api = wechatsogou.WechatSogouAPI()
ws_api.get_gzh_info('南航青年志愿者')
Out[17]:
In [19]:
articles = ws_api.search_article('南京航空航天大学')
In [20]:
for i in articles:
    print(i)
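search_article yields one dict per hit, so the results can also be collected into a DataFrame for easier inspection (a sketch; the exact field names depend on the wechatsogou version):
In [ ]:
import pandas as pd
hits = list(ws_api.search_article('南京航空航天大学'))
pd.DataFrame(hits).head()  # one row per article dict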
XPath, the XML Path Language, is a language for addressing parts of an XML document.
XPath is based on the tree structure of an XML document and provides the ability to locate nodes in that tree. It was originally proposed as a general syntax model sitting between XPointer and XSL, but developers quickly adopted it as a small query language.
Getting an element's XPath and extracting its text: the element's XPath has to be copied out manually, as follows:
In [31]:
import requests
from lxml import etree
url = 'https://movie.douban.com/subject/26611804/'
data = requests.get(url).text
s = etree.HTML(data)
If the Douban movie title corresponds to the XPath xpath_info, then title is obtained as:
title = s.xpath('xpath_info/text()')
where xpath_info is:
//*[@id="content"]/h1/span[1]
In [33]:
title = s.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
director = s.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
actors = s.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
type1 = s.xpath('//*[@id="info"]/span[5]/text()')
type2 = s.xpath('//*[@id="info"]/span[6]/text()')
type3 = s.xpath('//*[@id="info"]/span[7]/text()')
time = s.xpath('//*[@id="info"]/span[11]/text()')
length = s.xpath('//*[@id="info"]/span[13]/text()')
score = s.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
In [34]:
print(title, director, actors, type1, type2, type3, time, length, score)
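The scattered variables above are easier to reuse when gathered into one record; a sketch using the variables just extracted (note most xpath() calls return lists):
In [ ]:
movie = {'title': title,       # string (indexed with [0] above)
         'director': director, # list
         'actors': actors,     # list
         'genres': type1 + type2 + type3,
         'time': time,
         'length': length,
         'score': score}       # string (indexed with [0] above)
movie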
In [6]:
import requests
# https://movie.douban.com/subject/26611804/
url = 'https://api.douban.com/v2/movie/subject/26611804?apikey=0b2bdeda43b5688921839c8ecb20399b&start=0&count=20&client=&udid='
jsonm = requests.get(url).json()
In [11]:
jsonm.keys()
Out[11]:
In [3]:
#jsonm.values()
jsonm['rating']
Out[3]:
In [4]:
jsonm['alt']
Out[4]:
In [21]:
jsonm['casts'][0]
Out[21]:
In [10]:
jsonm['directors']
Out[10]:
In [13]:
jsonm['genres']
Out[13]:
In [55]:
import requests
from bs4 import BeautifulSoup
from lxml import etree
url0 = 'https://movie.douban.com/top250?start=0&filter='
data = requests.get(url0).text
s = etree.HTML(data)
In [56]:
s.xpath('//*[@id="content"]/div/div[1]/ol/li[1]/div/div[2]/div[1]/a/span[1]/text()')[0]
Out[56]:
In [57]:
s.xpath('//*[@id="content"]/div/div[1]/ol/li[2]/div/div[2]/div[1]/a/span[1]/text()')[0]
Out[57]:
In [227]:
s.xpath('//*[@id="content"]/div/div[1]/ol/li[3]/div/div[2]/div[1]/a/span[1]/text()')[0]
Out[227]:
In [58]:
import requests
from bs4 import BeautifulSoup
url0 = 'https://movie.douban.com/top250?start=0&filter='
data = requests.get(url0).text
soup = BeautifulSoup(data, 'lxml')
In [59]:
movies = soup.find_all('div', {'class': 'info'})
In [60]:
len(movies)
Out[60]:
In [61]:
movies[0].a['href']
Out[61]:
In [62]:
movies[0].find('span', {'class': 'title'}).text
Out[62]:
In [63]:
movies[0].find('div', {'class': 'star'})
Out[63]:
In [64]:
movies[0].find('span', {'class': 'rating_num'}).text
Out[64]:
In [65]:
people_num = movies[0].find('div', {'class': 'star'}).find_all('span')[-1]
people_num.text.split('人评价')[0]
Out[65]:
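Splitting on the Chinese label works here; pulling just the digits with a regex is slightly more robust to label changes, a sketch on the same node:
In [ ]:
import re
raw = movies[0].find('div', {'class': 'star'}).find_all('span')[-1].text
int(re.findall(r'\d+', raw)[0])  # '123456人评价' -> 123456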
In [66]:
for i in movies:
    url = i.a['href']
    title = i.find('span', {'class': 'title'}).text
    des = i.find('div', {'class': 'star'})
    rating = des.find('span', {'class': 'rating_num'}).text
    rating_num = des.find_all('span')[-1].text.split('人评价')[0]
    print(url, title, rating, rating_num)
In [67]:
for i in range(0, 250, 25):
    print('https://movie.douban.com/top250?start=%d&filter=' % i)
In [68]:
import requests
from bs4 import BeautifulSoup
dat = []
for j in range(0, 250, 25):
    urli = 'https://movie.douban.com/top250?start=%d&filter=' % j
    data = requests.get(urli).text
    soup = BeautifulSoup(data, 'lxml')
    movies = soup.find_all('div', {'class': 'info'})
    for i in movies:
        url = i.a['href']
        title = i.find('span', {'class': 'title'}).text
        des = i.find('div', {'class': 'star'})
        rating = des.find('span', {'class': 'rating_num'}).text
        rating_num = des.find_all('span')[-1].text.split('人评价')[0]
        listi = [url, title, rating, rating_num]
        dat.append(listi)
In [69]:
import pandas as pd
df = pd.DataFrame(dat, columns = ['url', 'title', 'rating', 'rating_num'])
df['rating'] = df.rating.astype(float)
df['rating_num'] = df.rating_num.astype(int)
df.head()
Out[69]:
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(df.rating_num)
plt.show()
In [19]:
plt.hist(df.rating)
plt.show()
In [11]:
# viz
fig = plt.figure(figsize=(16, 16), facecolor='white')
plt.plot(df.rating_num, df.rating, 'bo')
for i in df.index:
    plt.text(df.rating_num[i], df.rating[i], df.title[i],
             fontsize=df.rating[i], color='red', rotation=45)
plt.show()
In [123]:
df[df.rating > 9.4]
Out[123]:
In [69]:
alist = []
for i in df.index:
    alist.append([df.rating_num[i], df.rating[i], df.title[i]])
blist = [[df.rating_num[i], df.rating[i], df.title[i]] for i in df.index]
alist
Out[69]:
In [70]:
from IPython.display import display_html, HTML
HTML('<iframe src=http://nbviewer.jupyter.org/github/computational-class/bigdata/blob/gh-pages/vis/douban250bubble.html \
width=1000 height=500></iframe>')
Out[70]:
In [82]:
# headers = {
# 'Accept': 'application/json, text/javascript, */*; q=0.01',
# 'Accept-Encoding': 'gzip, deflate',
# 'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Cookie': 'JSESSIONID=992CB756ADE61B87409672DC808FDD92',
# 'DNT': '1',
# 'Host': 'www.jszx.gov.cn',
# 'Pragma': 'no-cache',
# 'Referer': 'http://www.jszx.gov.cn/zxta/2019ta/',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
# }
Open http://www.jszx.gov.cn/zxta/2019ta/ . The proposal data does not sit in the static HTML; it is pushed to the page by JavaScript, so:
- inspect the Network panel and find proposalList.jsp
- look at its headers, which reveal the form_data parameters
In [71]:
import requests
from bs4 import BeautifulSoup
In [72]:
form_data = {'year': 2019,
             'pagenum': 1,
             'pagesize': 20}
url = 'http://www.jszx.gov.cn/wcm/zxweb/proposalList.jsp'
content = requests.get(url, params=form_data)  # sent as query parameters
content.encoding = 'utf-8'
js = content.json()
In [74]:
js['data']['totalcount']
Out[74]:
In [75]:
dat = js['data']['list']
pagenum = js['data']['pagecount']
In [76]:
for i in range(2, pagenum + 1):
    print(i)
    form_data['pagenum'] = i
    content = requests.get(url, params=form_data)
    content.encoding = 'utf-8'
    js = content.json()
    for j in js['data']['list']:
        dat.append(j)
In [77]:
len(dat)
Out[77]:
In [78]:
dat[0]
Out[78]:
In [79]:
import pandas as pd
df = pd.DataFrame(dat)
df.head()
Out[79]:
In [158]:
df.groupby('type').size()
Out[158]:
In [80]:
url_base = 'http://www.jszx.gov.cn/wcm/zxweb/proposalInfo.jsp?pkid='
urls = [url_base + i for i in df['pkid']]
In [81]:
import sys

def flushPrint(www):
    # overwrite the same console line to show crawl progress
    sys.stdout.write('\r')
    sys.stdout.write('%s' % www)
    sys.stdout.flush()

text = []
for k, i in enumerate(urls):
    flushPrint(k)
    content = requests.get(i)
    content.encoding = 'utf-8'
    js = content.json()
    js = js['data']['binfo']['_content']
    soup = BeautifulSoup(js, 'html.parser')
    text.append(soup.text)
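With several hundred detail pages, one transient network error aborts the loop above; a hedged variant that pauses between requests and records failures instead of crashing (the timeout and sleep values are assumptions):
In [ ]:
import time
text = []
for k, i in enumerate(urls):
    flushPrint(k)
    try:
        content = requests.get(i, timeout=10)
        content.encoding = 'utf-8'
        js = content.json()['data']['binfo']['_content']
        text.append(BeautifulSoup(js, 'html.parser').text)
    except Exception as e:
        text.append('')  # keep text aligned with urls
        print('failed:', i, e)
    time.sleep(0.5)      # be polite to the server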
In [82]:
len(text)
Out[82]:
In [83]:
df['content'] = text
In [84]:
df.head()
Out[84]:
In [181]:
df.to_csv('../data/jszx2019.csv', index = False)
In [182]:
dd = pd.read_csv('../data/jszx2019.csv')
dd.head()
Out[182]: