# In [2]:  (Jupyter cell marker — commented out so the file parses as plain Python)
# %load SSEannouncement.py
__author__ = 'pchaosgit'
__email__ = 'drifthua@gmail.com'
# -*-coding:utf-8-*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
import datetime
import wget
import os
import tempfile
from bs4 import BeautifulSoup
import logging

# Module-wide logging: INFO level, with source file/line in every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
)

def login(username, password):
    """Log in to weibo.com in a fresh Firefox window.

    Fills the login form, asks on stdin for a verification code when the
    page shows one, and clicks the submit button.  Side effects only;
    returns None and leaves the browser window open.

    NOTE(review): the XPaths target a historical weibo.com page layout —
    confirm they still match before relying on this.

    Args:
        username: account name typed into the user field.
        password: account password typed into the password field.
    """
    # Local import keeps the module importable even though only this
    # function needs the exception class.
    from selenium.common.exceptions import NoSuchElementException

    browser = webdriver.Firefox()
    browser.get("http://weibo.com/login.php")

    user = browser.find_element_by_xpath("//*[@id='pl_login_form']/div[5]/div[1]/div/input")
    user.send_keys(username, Keys.ARROW_DOWN)

    passwd = browser.find_element_by_xpath("//*[@id='pl_login_form']/div[5]/div[2]/div/input")
    passwd.send_keys(password, Keys.ARROW_DOWN)

    # The captcha field is not always rendered.  The original located it
    # unconditionally (find_element_by_xpath raises when the element is
    # absent), which made its truthiness check dead code and crashed the
    # captcha-free path.  Treat "element missing" as "no captcha required".
    try:
        vcode = browser.find_element_by_xpath("//*[@id='pl_login_form']/div[5]/div[3]/div/input")
    except NoSuchElementException:
        vcode = None
    if vcode is not None:
        # raw_input was removed in Python 3; fall back to input() there.
        try:
            prompt = raw_input  # Python 2
        except NameError:
            prompt = input      # Python 3
        code = prompt("verify code:")
        if code:
            vcode.send_keys(code, Keys.ARROW_DOWN)

    browser.find_element_by_xpath("//*[@id='pl_login_form']/div[5]/div[6]/div[1]/a/span").click()

    logging.debug( browser.find_element_by_xpath("//*[@id='v6_pl_content_homefeed']/div[2]/div[3]/div[1]/div[1]/div[3]/div[1]/a[1]").get_attribute("usercard"))

def getBrower(browserName="firefox"):
    """Create a Selenium WebDriver for the named browser.

    Args:
        browserName: case-insensitive browser name; "firefox" (default)
            or "chrome".

    Returns:
        A newly started WebDriver instance.

    Raises:
        ValueError: for an unsupported browser name.  The original
        silently returned None here, which only surfaced later as an
        AttributeError at the call site.
    """
    name = browserName.lower()
    if name == "firefox":
        return webdriver.Firefox()
    if name == "chrome":
        return webdriver.Chrome()
    raise ValueError("unsupported browser: {0!r}".format(browserName))

def test():
    """Smoke-test the weibo login flow with placeholder credentials."""
    credentials = ("email", "passwd")
    login(*credentials)

def testsse(stcode):
    """Fetch and log the SSE announcement listing for *stcode*.

    Thin wrapper around sseSearchbyhrefs; the original's no-op
    self-assignment and dead commented-out sseSearch() call were removed.

    Args:
        stcode: stock code string, e.g. "600401".
    """
    sseSearchbyhrefs(stcode)

def sseSearch(stcode):
    """Search the SSE disclosure page for *stcode* via the on-page search box.

    Opens its own Firefox instance, types the code into the search field,
    submits with Enter, and blocks (up to 10 s) until at least one result
    link matching ``a._blank`` is present.  Side effects only; returns None.
    """
    driver = webdriver.Firefox()
    driver.get("http://www.sse.com.cn/disclosure/listedinfo/announcement/")
    search_box = driver.find_element_by_xpath("//*[@id='productId']")
    search_box.send_keys(stcode)
    search_box.send_keys(Keys.RETURN)
    waiter = ui.WebDriverWait(driver, 10)
    waiter.until(lambda drv: drv.find_element_by_css_selector('a._blank'))

def stockannouncementURL(stcode, startDate=None, endDate=None):
    """Navigate the shared global ``browser`` to the SSE announcement page for *stcode*.

    Args:
        stcode: stock code string, used for both COMPANY_CODE and productId.
        startDate: optional date; ``None`` means today.  Kept for interface
            compatibility — the body does not use it yet (neither did the
            original, whose ``datetime.date.today()`` default was evaluated
            once at import time, a stale-default bug).
        endDate: same as startDate.

    Returns:
        The result of ``browser.get`` (None for Selenium drivers).
    """
    global browser
    if startDate is None:
        startDate = datetime.date.today()
    if endDate is None:
        endDate = datetime.date.today()
    urlparam = {"COMPANY_CODE": stcode, "productId": stcode, "bt": "全部", "static": "t"}
    url = "http://www.sse.com.cn/assortment/stock/list/stockdetails/announcement/index.shtml"
    return browser.get(url + getparams(urlparam))

def sseSearchbyhrefs(stcode):
    """Load the announcement listing for *stcode* in the shared global browser.

    Navigates via stockannouncementURL, waits (up to 10 s) for the
    ``div#announcementDiv`` table to render, and logs its title and text.
    Side effects only; returns None.

    Args:
        stcode: stock code string, e.g. "600401".
    """
    global browser
    wait = ui.WebDriverWait(browser, 10)
    # The original bound this call's (always-None) return value to an
    # unused variable; call it for its navigation side effect only.
    stockannouncementURL(stcode)
    # Lambda parameter renamed so it no longer shadows the global ``browser``.
    wait.until(lambda drv: drv.find_element_by_xpath("//div[@id='announcementDiv']"))
    logging.debug("browser.title: " + browser.title)
    logging.debug(browser.find_element_by_xpath("//div[@id='announcementDiv']").text)

def getparams(urlparam):
    """Serialize a mapping into a URL query string.

    Fixes a bug in the original: it built ``"?&k=v&..."`` and then called
    ``lstrip("&")``, which was a no-op because the string starts with
    ``"?"``, so a stray ``&`` was left after the question mark.  (It also
    shadowed the builtin ``str``.)

    Args:
        urlparam: mapping of parameter names to string values.  Values are
            NOT percent-encoded, matching the original behavior.

    Returns:
        ``"?k1=v1&k2=v2"`` in the mapping's iteration order; just ``"?"``
        for an empty mapping.
    """
    return "?" + "&".join("{0}={1}".format(k, v) for k, v in urlparam.items())

def downloadannouncement(url, tagetfileName):
    """Download *url* into the system temp directory, skipping existing files.

    The local file name is *tagetfileName* plus the extension taken from the
    URL's path.  NOTE(review): ``os.path.splitext`` on a full URL would fold
    any query string into the "extension"; the SSE hrefs handled here appear
    to be plain ``.pdf`` paths — confirm if reused elsewhere.

    Args:
        url: direct link to the document.
        tagetfileName: base name (without extension) for the local file.
    """
    _, file_ext = os.path.splitext(url)
    # os.path.join instead of manual "/" concatenation (portable).
    filename = os.path.join(tempfile.gettempdir(), tagetfileName + file_ext)
    if not os.path.isfile(filename):
        logging.info("downloading: {0} {1}".format(url, filename))
        # NOTE: interpolating the raw URL into some formatters can raise
        # "urwid.canvas.CanvasError: Canvas text is wider than the maxcol
        # specified" — keep str.format here.
        wget.download(url, filename)
    else:
        logging.info("FileExist: " + url +  filename)

def getAllPDF(soup):
    """Download every document linked from the announcement table in *soup*.

    Walks each ``div#announcementDiv``, and for every ``<tr>`` that contains
    at least one ``<a>``, builds a local file name from the row's text cells
    as "td1_td2_td4_td0" and downloads each link found in the first cell.
    """
    for container in soup.find_all("div", {"id": "announcementDiv"}):
        for row in container.find_all("tr"):
            if not row.find_all("a"):
                continue
            cells = row.find_all("td")
            filename = "_".join(
                (cells[1].string, cells[2].string, cells[4].string, cells[0].string)
            )
            for anchor in cells[0].find_all("a"):
                downloadannouncement(anchor["href"], filename)

if __name__ == "__main__":
    logging.info(" started")
    browser = getBrower()
    try:
        browser.maximize_window()
        # The original assigned "600401" first and immediately overwrote it;
        # the dead assignment and a commented-out second run were removed.
        stcode = "601169"
        testsse(stcode)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        getAllPDF(soup)
    finally:
        # quit() — not close() — ends the whole WebDriver session; close()
        # only closes the current window and leaks the driver process.
        browser.quit()
        logging.info(" Ended")

# In [ ]:  (Jupyter cell marker — commented out so the file parses as plain Python)
import numpy as np

from scipy.ndimage import filters

# In [3]:  (Jupyter cell marker — commented out so the file parses as plain Python)
import numpy as np

from scipy.ndimage import filters