Andres De Castro
https://github.com/andres-de-castro/scraping
Slides built with RISE: https://github.com/damianavila/RISE
In [1]:
import pandas as pd

# Morningstar serves the AAPL split history as a plain HTML table;
# read_html returns every <table> on the page, and [0] takes the first.
url = 'http://performance.morningstar.com/Performance/stock/split-history.action?&t=AAPL'
pd.read_html(url)[0]
Out[1]:
The target data lives in a plain <table> element, so read_html can lift it straight off the page.
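When a page carries more than one table, read_html returns them all; its match argument (a string or regex tested against each table's text) narrows the list. A quick sketch against the same URL, where 'Ratio' is only a guess at the split table's contents:

# Keep only tables whose text matches the pattern; 'Ratio' is an
# assumption about a column header in the split-history table.
splits = pd.read_html(url, match='Ratio')[0]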
http://www2.tse.or.jp/tseHpFront/JJK020010Action.do?Show=Show #1301
In [2]:
# The naive approach
url = 'http://quote.jpx.co.jp/jpx/template/quote.cgi?F=tmp/e_stock_detail&MKTN=T&QCODE=1301'
try:
    # read_html fetches the URL itself, sending no cookies or referer
    pd.read_html(url)
except Exception as e:
    print(e)
Thought Process
The bare request fails, so look at what a real browser sends. Load the search page below, find the quote request in the browser's network inspector, and copy it as a cURL command, headers and cookies included.
http://www2.tse.or.jp/tseHpFront/JJK020010Action.do?Show=Show
%%bash
curl 'http://quote.jpx.co.jp/jpx/template/quote.cgi?F=tmp/e_stock_detail&MKTN=T&QCODE=1301' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8,es;q=0.6' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www2.tse.or.jp/tseHpFront/JJK020010Action.do' -H 'Cookie: TS4be622=5de6667395943132172f01acdabc66df16cd3f45e0bd3db2578e4e0e' -H 'Connection: keep-alive' -H 'Cache-Control: max-age=0' --compressed
In [7]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
codes = ['9986', '9987', '9989', '9990'] #'9991', '9992', '9993', '9994', '9995', '9996']
for code in codes:
    # Replay the headers the browser sent: user agent, session cookie, referer.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
        'Cookie': '__utma=139475176.428689694.1438095265.1439320455.1440102255.14; __utmz=139475176.1440102255.14.6.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); TS4be622=c6390468d7aed6d150c549c11b5dbc654181b62eb149119556167434',
        'Referer': 'http://www2.tse.or.jp/tseHpFront/JJK020010Action.do'
    }
    payload = {'F': 'tmp/e_stock_detail',
               'MKTN': 'T',
               'QCODE': str(code)}
    r = requests.post('http://quote.jpx.co.jp/jpx/template/quote.cgi?F=tmp/e_stock_detail&MKTN=T&QCODE=' + str(code),
                      data=payload, headers=headers)
    # The price history is embedded as CSV inside a hidden <input id="histData">.
    soup = BeautifulSoup(r.content, 'lxml')
    values = soup.find(id="histData")['value']
    df = pd.read_csv(StringIO(values), header=None, index_col=0)
    df = df.drop(df.columns[-1], axis=1)  # drop the unused last column
    df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    df.index.names = ['Date']

df.tail(10)
Out[7]:
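Each pass of the loop overwrites df, so only the last code survives. To keep every code, the same steps can be wrapped in a helper and concatenated; fetch_history is a hypothetical name, and the extra Code column is just one way to tag the rows. This sketch reuses the imports, codes, and headers from the cell above.

def fetch_history(code, headers):
    # Same steps as the cell above, for one security code.
    payload = {'F': 'tmp/e_stock_detail', 'MKTN': 'T', 'QCODE': str(code)}
    url = ('http://quote.jpx.co.jp/jpx/template/quote.cgi'
           '?F=tmp/e_stock_detail&MKTN=T&QCODE=' + str(code))
    r = requests.post(url, data=payload, headers=headers)
    values = BeautifulSoup(r.content, 'lxml').find(id='histData')['value']
    df = pd.read_csv(StringIO(values), header=None, index_col=0)
    df = df.drop(df.columns[-1], axis=1)
    df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    df.index.names = ['Date']
    df['Code'] = code  # tag rows so codes stay distinguishable after concat
    return df

history = pd.concat([fetch_history(c, headers) for c in codes])
history.tail(10)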
In [9]:
import sys
from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue
class Scraper():

    def __init__(self):
        # Shared work queue that read() fills and get() workers drain.
        self.queue = Queue()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout, http_client):
        # Worker coroutine: pulls URLs off the queue until the loop stops.
        while True:
            url = yield self.queue.get()
            try:
                request = HTTPRequest(url,
                                      connect_timeout=connect_timeout,
                                      request_timeout=request_timeout,
                                      method="GET",
                                      headers=headers)
            except Exception as e:
                sys.stderr.write('Destination {0} returned error {1}\n'.format(url, str(e)))
                self.queue.task_done()
                continue

            future = http_client.fetch(request)

            def done_callback(future):
                try:
                    body = future.result().body
                    url = future.result().effective_url
                    transform(body, url=url)
                finally:
                    # Mark the task done even if the fetch failed,
                    # so queue.join() cannot hang on an error.
                    self.queue.task_done()

            future.add_done_callback(done_callback)
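tse.py itself isn't reproduced here. A minimal driver for the class might look like the sketch below; the worker count, timeouts, and the store-the-body transform are assumptions, not the settings behind the timing that follows.

from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient

pages = {}  # url -> raw response body

def transform(body, url=None):
    # tse.py would parse histData here; this sketch just stores the body.
    pages[url] = body

@gen.coroutine
def main():
    scraper = Scraper()
    client = AsyncHTTPClient()
    urls = ['http://quote.jpx.co.jp/jpx/template/quote.cgi'
            '?F=tmp/e_stock_detail&MKTN=T&QCODE=' + code
            for code in ['1301', '1332']]
    # gen.coroutine functions start running as soon as they are called,
    # so this launches four concurrent workers before the queue is filled.
    for _ in range(4):
        scraper.get(transform,
                    headers={},  # a real run would pass the Cookie/Referer headers from above
                    connect_timeout=10, request_timeout=10,
                    http_client=client)
    yield scraper.read(urls)
    yield scraper.queue.join()  # resolves once every task_done has fired

ioloop.IOLoop.current().run_sync(main)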
In [5]:
%%bash
time python tse.py
In [6]:
pd.read_csv('tse.csv').head(10)
Out[6]: