In [1]:
import re
from collections import namedtuple
from urllib.parse import urljoin

import bs4
import requests
from bs4 import BeautifulSoup
In [24]:
# Site root, and the list page that the rest of the notebook scrapes.
home_url = 'http://guba.eastmoney.com/'
url = home_url + 'list,600519_1.html'
def get_URL(url, timeout=10):
    """Download `url` and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds `requests` waits for the server before raising a
            timeout error. requests applies no timeout unless one is given,
            so a stalled server would otherwise block the cell indefinitely.

    Returns:
        The decoded response body (`response.text`).
    """
    # Browser-like user agent sent with every request.
    headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def get_urlSoup(url, timeout=10):
    """Download `url` and parse the response body with the 'lxml' parser.

    Args:
        url: Address to request.
        timeout: Seconds `requests` waits for the server before raising a
            timeout error. requests applies no timeout unless one is given,
            so a stalled server would otherwise block the cell indefinitely.

    Returns:
        A `BeautifulSoup` document built from the response text.
    """
    # Browser-like user agent sent with every request.
    headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')
# Fetch and parse the page at `url`; this issues an HTTP request every time the cell runs.
soup = get_urlSoup(url)
In [25]:
# Every <div class="articleh"> element found in the parsed list page.
summary = soup.find_all('div', class_='articleh')
In [142]:
# Keep only the rows whose markup does not contain the text 'em class'.
pattern = re.compile('em class')
simple_summary = list(filter(lambda row: pattern.search(str(row)) is None, summary))
In [139]:
class brief_item:
    """Parsed view of one row tag from a list page, plus the linked post's text.

    The constructor reads the counters, link, category and author from the
    row's `span.l1` .. `span.l4` children and then downloads the linked page
    to fill `detail`. A field that cannot be read falls back to a placeholder
    (0 for the counters, 'none' for text fields) instead of raising.
    """

    def __init__(self, soup):
        """Parse `soup` and fetch the detail page it links to.

        Args:
            soup: `bs4.element.Tag` for one row (the notebook passes
                `div.articleh` elements).

        Raises:
            AssertionError: if `soup` is not a `bs4.element.Tag`.
        """
        assert isinstance(soup, bs4.element.Tag)
        self.home_url = 'http://guba.eastmoney.com/'
        # Non-greedy tag matcher used to strip markup from the detail block.
        self.filter_pattern = re.compile('<.+?>')

        # View count.
        self.viewer = self._span_int(soup, 'l1')
        # Reply count. Parsed on its own so a failure here can never reset
        # `viewer` or leave `response` undefined.
        self.response = self._span_int(soup, 'l2')

        # Title and link.
        title_cells = soup.find_all('span', attrs={'class': 'l3'})
        try:
            link = title_cells[0].find_all('a')[0]
            # Tuple assignment: both attributes are set, or neither is.
            self.href, self.title = link['href'], link['title']
        except (IndexError, KeyError):
            self.href = self.title = 'none'

        # Category label. The <em> is optional, so it is read separately and
        # its absence does not discard an already parsed link and title.
        try:
            self.attr = title_cells[0].find_all('em')[0].string
        except IndexError:
            self.attr = 'none'

        # User info.
        try:
            author = soup.find_all('span', attrs={'class': 'l4'})[0].find_all('a')[0]
            self.user, self.user_id = author.string, author['data-popper']
        except (IndexError, KeyError):
            self.user = self.user_id = 'none'

        self.parse_detail()

    @staticmethod
    def _span_int(soup, css_class):
        """Return the integer text of the first `<span class=css_class>`, or 0."""
        try:
            return int(soup.find_all('span', attrs={'class': css_class})[0].string)
        except (IndexError, TypeError, ValueError):
            # Span missing, no single text node, or text that is not an integer.
            return 0

    def get_urlSoup(self, url, timeout=10):
        """Download `url` and parse the response body with the 'lxml' parser.

        Args:
            url: Address to request.
            timeout: Seconds `requests` waits for the server before raising a
                timeout error; without it a stalled server blocks forever.

        Returns:
            A `BeautifulSoup` document built from the response text.
        """
        headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
        response = requests.get(url, headers=headers, timeout=timeout)
        return BeautifulSoup(response.text, 'lxml')

    def parse_detail(self):
        """Fetch the linked page and store its tag-stripped text in `self.detail`.

        `self.detail` is set to 'none' when the row had no usable link or the
        page has no `<div class="stockcodec">` block.
        """
        if self.href == 'none':
            # No link was parsed; there is nothing meaningful to download.
            self.detail = 'none'
            return

        # urljoin copes with root-relative ('/x'), protocol-relative ('//host/x')
        # and absolute hrefs, which plain string concatenation would mangle.
        detail_soup = self.get_urlSoup(urljoin(self.home_url, self.href))
        blocks = detail_soup.find_all('div', attrs={'class': 'stockcodec'})
        if not blocks:
            self.detail = 'none'
            return
        self.detail = self.filter_pattern.sub('', str(blocks[0])).strip()

    @property
    def show(self):
        """Print every parsed field, one per line; evaluates to None."""
        print('title : %s' % (self.title))
        print('attr : %s' % (self.attr))
        print('href : %s' % (self.href))
        print('user : %s' % (self.user))
        print('user_id : %s' % (self.user_id))
        print('viewer : %s' % (self.viewer))
        print('response : %s' % (self.response))
        print('detail : %s' % (self.detail))
In [140]:
# Build an item from the first row; the constructor also downloads the row's linked page.
a = brief_item(summary[0])
In [141]:
# `show` is a property, so plain attribute access prints the parsed fields.
a.show