In [ ]:
import re
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
In [ ]:
# Connection settings for the MongoDB server that hosts both databases.
db_ip = '192.168.1.21'
db_port = 27017
db_kline_name = 'ricequant'  # its collection names are used as the stock code list
db_hunter_name = 'hunter'    # holds the collection the sector documents are written to
col_stock_sector_name = 'stock_sector'

client = MongoClient(db_ip, db_port)
db_kline = client[db_kline_name]

# Database.collection_names() was deprecated in PyMongo 3.7 and removed in 4.0;
# list_collection_names() is its replacement.  The method is looked up on the
# class because Database.__getattr__ turns unknown attribute names into
# Collection objects, so hasattr() on the instance would always be true.
if hasattr(type(db_kline), 'list_collection_names'):
    # Same filtering that include_system_collections=False used to apply.
    code_list = [name for name in db_kline.list_collection_names()
                 if not name.startswith('system.')]
else:
    code_list = db_kline.collection_names(include_system_collections=False)

col_stock_sector = client[db_hunter_name][col_stock_sector_name]
In [ ]:
# URL pieces: a page address is assembled as
#     base_url + <stock code> + <page suffix>
base_url = "http://basic.10jqka.com.cn/"
company_post_url = "/company.html"  # suffix of the page read by get_stock_industry
concept_post_url = "/concept.html"  # suffix of the page read by get_stock_concept
In [ ]:
def get_stock_industry(code):
    """Return the industry labels listed on the company page of ``code``.

    The page ``base_url + code + company_post_url`` is downloaded and the
    raw industry string is taken from the first ``m_table`` table: second
    row, second cell, second text node.  That string is split on ' — ' and
    duplicates are dropped while keeping the order in which the parts
    appear on the page.

    Args:
        code: Stock code string inserted into the URL.

    Returns:
        A list of industry name strings.  An empty list is returned (and a
        message printed) when the table, row or cell is missing, or when
        the extracted text contains no word character.
    """
    res = requests.get(base_url + code + company_post_url)
    soup = BeautifulSoup(res.text, 'lxml')

    industry = None
    table = soup.find('table', attrs={'class': 'm_table'})
    if table is not None:
        try:
            industry = table.find_all('tr')[1].find_all('td')[1].find_all(text=True)[1]
        except IndexError:
            # Page layout differs from what is expected; handled below as
            # "industry not found" instead of aborting the caller.
            industry = None

    # Raw string so that \w is a regex class, not an invalid str escape.
    if industry is not None and re.search(r'\w', industry):
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(industry.split(' — ')))

    print('Can not find industry for {}, raw industry: {}'.format(code, industry))
    return []
def _concept_cells(soup, div_id, cell_class):
    """Collect stripped cell texts from the first ``gnContent`` table of a div.

    Looks up the first ``<div id=div_id>``, then its first
    ``<table class="gnContent">``, and returns the stripped text of every
    ``<td class=cell_class>`` inside it.  Returns an empty list when the div
    or the table is absent.
    """
    section = soup.find('div', attrs={'id': div_id})
    if section is None:
        return []
    table = section.find('table', attrs={'class': 'gnContent'})
    if table is None:
        return []
    return [cell.text.strip()
            for cell in table.find_all('td', attrs={'class': cell_class})]


def get_stock_concept(code):
    """Return the concept names listed on the concept page of ``code``.

    The page ``base_url + code + concept_post_url`` is downloaded and three
    sections are read from it, identified by the div ids 'concept',
    'emerging' and 'other'.

    Args:
        code: Stock code string inserted into the URL.

    Returns:
        A tuple ``(normal_concept, emerging_concept, other_concept)`` of
        lists of strings; a list is empty when its section is missing.
    """
    res = requests.get(base_url + code + concept_post_url)
    soup = BeautifulSoup(res.text, 'lxml')

    # NOTE(review): the 'concept' section reads 'gnName' cells while the other
    # two read 'gnStockList' cells -- kept as in the original; confirm against
    # the page markup.
    normal_concept = _concept_cells(soup, 'concept', 'gnName')
    emerging_concept = _concept_cells(soup, 'emerging', 'gnStockList')
    other_concept = _concept_cells(soup, 'other', 'gnStockList')
    return normal_concept, emerging_concept, other_concept
In [ ]:
# Scrape industry and concept labels for every entry of code_list and keep
# one summary dict per stock in stock_category.
stock_category = []
for idx, code in enumerate(code_list):
    # Only the part of the code before the first '.' is used in the URLs.
    simple_code = code.partition('.')[0]

    ths_industry = get_stock_industry(simple_code)
    (ths_normal_concept,
     ths_emerging_concept,
     ths_other_concept) = get_stock_concept(simple_code)

    stock_info = dict(
        code=code,
        ths_industry=ths_industry,
        ths_normal_concept=ths_normal_concept,
        ths_emerging_concept=ths_emerging_concept,
        ths_other_concept=ths_other_concept,
    )
    stock_category.append(stock_info)

    # Progress line: running index, full code, then the four label lists.
    print('{} {}: {} {} {} {}'.format(
        idx, code, ths_industry, ths_normal_concept,
        ths_emerging_concept, ths_other_concept))

print('Finished')
In [ ]:
# Flatten stock_category into one document per (stock, label) pair.  For each
# stock the documents are emitted in the order of the type keys listed below,
# and each document records which key its label came from.
stock_category_doc = []
for st in stock_category:
    for sector_type in ('ths_industry',
                        'ths_normal_concept',
                        'ths_emerging_concept',
                        'ths_other_concept'):
        for cat in st[sector_type]:
            stock_category_doc.append(
                {'windCode': st['code'], 'sector': cat, 'type': sector_type})
In [ ]:
# Replace the contents of the sector collection with the freshly built
# documents.
# NOTE(review): the collection is emptied even when there is nothing to
# insert -- confirm that wiping it after an empty scrape is intended.
col_stock_sector.delete_many({})
if stock_category_doc:
    # Skipped when the list is empty or None, so insert_many never receives
    # an empty batch.
    col_stock_sector.insert_many(stock_category_doc)
In [ ]: