In [ ]:
import re
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [ ]:
# MongoDB connection settings: host/port of the server, the database whose
# collection names are used as the list of stock codes, and the database /
# collection that receives the scraped sector documents.
db_ip = '192.168.1.21'
db_port = 27017
db_kline_name = 'ricequant'
db_hunter_name = 'hunter'
col_stock_sector_name = 'stock_sector'

client = MongoClient(db_ip, db_port)
db_kline = client[db_kline_name]
# Database.collection_names() was deprecated in PyMongo 3.7 and removed in 4.0.
# list_collection_names() is the replacement; it has no
# include_system_collections flag, so "system." collections are dropped here.
code_list = [name for name in db_kline.list_collection_names()
             if not name.startswith('system.')]

col_stock_sector = client[db_hunter_name][col_stock_sector_name]

In [ ]:
# URL pieces for the per-stock pages. get_stock_industry and get_stock_concept
# build the request URL as base_url + <stock code> + one of the suffixes below.
base_url = 'http://basic.10jqka.com.cn/'
# Suffix of the page get_stock_industry reads the industry from.
company_post_url = '/company.html'
# Suffix of the page get_stock_concept reads the concept tables from.
concept_post_url = '/concept.html'

In [ ]:
def get_stock_industry(code, timeout=10):
    """Scrape the industry names of one stock from its company page.

    Requests ``base_url + code + company_post_url``, takes the first table
    with class ``m_table`` and reads the second text node of the second cell
    in its second row. That text is split on ' — ' into industry names.

    Args:
        code: Stock code as used in the page URL (the caller passes the part
            of the collection name before the first '.').
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A list of de-duplicated, whitespace-stripped industry names in the
        order they appear on the page. An empty list is returned (and a
        message printed) when the expected table/cell is missing or the cell
        text contains no word character.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    res = requests.get(base_url + code + company_post_url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'lxml')
    table = soup.find('table', attrs={'class': 'm_table'})
    try:
        industry = table.find_all('tr')[1].find_all('td')[1].find_all(string=True)[1]
    except (AttributeError, IndexError):
        # table is None (AttributeError) or a row/cell/text node is absent
        # (IndexError): treat it like any other "no industry" page instead of
        # aborting the caller's loop with an unrelated-looking traceback.
        print('Can not find industry for {}: unexpected page layout'.format(code))
        return []

    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    # Searching for a single word character is equivalent to the former
    # '.*\w.*' search.
    if not re.search(r'\w', industry):
        print('Can not find industry for {}, raw industry: {}'.format(code, industry))
        return []

    # De-duplicate while keeping page order (set() would shuffle the levels),
    # and strip surrounding whitespace as get_stock_concept does for its names.
    seen = set()
    names = []
    for part in industry.split(' — '):
        name = part.strip()
        if name and name not in seen:
            seen.add(name)
            names.append(name)
    return names

def _extract_concept_cells(soup, div_id, cell_class):
    """Return stripped text of the ``cell_class`` cells in one concept section.

    Looks at the first ``div`` with id ``div_id`` and, inside it, the first
    ``table`` with class ``gnContent``. Returns an empty list when either is
    absent.
    """
    divs = soup.find_all('div', attrs={'id': div_id})
    if not divs:
        return []
    tables = divs[0].find_all('table', attrs={'class': 'gnContent'})
    if not tables:
        return []
    return [cell.text.strip()
            for cell in tables[0].find_all('td', attrs={'class': cell_class})]


def get_stock_concept(code, timeout=10):
    """Scrape the concept tags of one stock from its concept page.

    Requests ``base_url + code + concept_post_url`` and reads three sections
    of the page, identified by the div ids ``concept``, ``emerging`` and
    ``other``.

    Args:
        code: Stock code as used in the page URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        A tuple ``(normal_concept, emerging_concept, other_concept)`` of
        lists of stripped strings; a list is empty when its section or table
        is not present on the page.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    res = requests.get(base_url + code + concept_post_url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'lxml')

    normal_concept = _extract_concept_cells(soup, 'concept', 'gnName')
    # NOTE(review): the emerging/other sections are read from 'gnStockList'
    # cells while the normal section uses 'gnName'. Kept as in the original;
    # confirm against the live page that this difference is intentional.
    emerging_concept = _extract_concept_cells(soup, 'emerging', 'gnStockList')
    other_concept = _extract_concept_cells(soup, 'other', 'gnStockList')

    return normal_concept, emerging_concept, other_concept

In [ ]:
# Fetch industry and concept tags for every code in code_list. One record per
# stock is appended to stock_category and echoed as soon as it is fetched.
# Any exception from the scraping helpers stops the loop.
stock_category = []
for idx, code in enumerate(code_list):
    # The page URLs use only the part of the code before the first '.'.
    simple_code = code.partition('.')[0]
    industry = get_stock_industry(simple_code)
    normal, emerging, other = get_stock_concept(simple_code)
    stock_category.append(dict(
        code=code,
        ths_industry=industry,
        ths_normal_concept=normal,
        ths_emerging_concept=emerging,
        ths_other_concept=other,
    ))
    print('{} {}: {} {} {} {}'.format(idx, code, industry, normal, emerging, other))
print('Finished')

In [ ]:
# Flatten the per-stock records into one document per (stock, sector) pair.
# For each stock the categories are emitted in the order listed below, and the
# record key doubles as the document's 'type' value.
stock_category_doc = [
    {'windCode': st['code'], 'sector': cat, 'type': sector_type}
    for st in stock_category
    for sector_type in ('ths_industry',
                        'ths_normal_concept',
                        'ths_emerging_concept',
                        'ths_other_concept')
    for cat in st[sector_type]
]

In [ ]:
# Replace the collection's contents with the freshly scraped documents.
# The delete is performed only when there is something to insert: clearing
# first and then finding the scrape produced nothing would leave the
# collection empty with no replacement data.
if stock_category_doc:
    col_stock_sector.delete_many({})
    col_stock_sector.insert_many(stock_category_doc)
else:
    print('No sector documents were produced; existing collection left unchanged')

In [ ]: