In [1]:
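# pandas I/O guide, section on read_html: https://pandas.pydata.org/pandas-docs/stable/io.html#io-read-html
# API reference for pandas.read_html: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_html.html
# Prerequisite parser libraries for read_html: pip install lxml html5lib beautifulsoup4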
In [2]:
from pathlib import Path

import pandas as pd
In [3]:
# Yahoo! Finance Japan ranking page; read_html parses the HTML tables it finds
# at this address into a list of DataFrames.
url = 'https://info.finance.yahoo.co.jp/ranking/?kd=4'
dfs = pd.read_html(io=url)
In [4]:
# How many tables read_html returned for the ranking page.
print(len(dfs))
In [5]:
# Preview the first rows of the first parsed table.
print(dfs[0].head())
In [6]:
# Preview only the name and market-cap columns, selected by the labels as parsed
# from the page (the columns have not been reassigned yet at this point).
print(dfs[0][['名称', '時価総額(百万円)']].head())
In [7]:
# Overwrite the scraped header of the first table with explicit labels.
# The list must have exactly one entry per column of dfs[0].
dfs[0].columns = [
    '順位',
    'コード',
    '市場',
    '名称',
    '時刻',
    '取引値',
    '発行済み株式数',
    '時価総額(百万円)',
    '単元株数',
    '掲示板',
]
In [8]:
# Same two-column preview, now relying on the labels assigned to dfs[0].columns.
print(dfs[0][['名称', '時価総額(百万円)']].head())
In [9]:
# Switch to the Japanese Wikipedia article on Python and parse its tables.
# Note: this rebinds both `url` and `dfs`, discarding the ranking tables.
url = 'https://ja.wikipedia.org/wiki/Python'
dfs = pd.read_html(io=url)
In [10]:
# How many tables read_html returned for the Wikipedia article.
print(len(dfs))
In [11]:
# Parse the page again, keeping only the tables whose text matches 'リリース日'.
dfs = pd.read_html(
    io=url,
    match='リリース日',
)
In [12]:
# Number of tables left after filtering with match='リリース日'.
print(len(dfs))
In [13]:
# Show the first matched table in full (parsed without an explicit header row).
print(dfs[0])
In [14]:
# Show the second matched table in full; assumes the match produced at least two tables.
print(dfs[1])
In [15]:
# Same filtered parse as before, but treat row 0 of each table as its column header.
dfs = pd.read_html(
    io=url,
    match='リリース日',
    header=0,
)
In [16]:
# Number of matched tables from the header=0 parse.
print(len(dfs))
In [17]:
# First matched table, now with its first row used as column labels.
print(dfs[0])
In [18]:
# Second matched table, now with its first row used as column labels.
print(dfs[1])
In [19]:
# Stack the two release tables into a single frame with a fresh 0..n-1 index.
df = pd.concat([dfs[0], dfs[1]], ignore_index=True)

# The release-date header scraped from Wikipedia carries a footnote marker
# (e.g. 'リリース日[17]') whose number changes whenever the article is edited.
# Resolve the column by its stable prefix instead of hard-coding the full label,
# so the sort does not break with a KeyError after an unrelated page edit.
release_cols = [col for col in df.columns if str(col).startswith('リリース日')]
if not release_cols:
    raise KeyError("no column starting with 'リリース日' in the combined tables")

# Values are sorted as scraped; no date parsing is applied here.
df = df.sort_values(release_cols[0])
print(df)
In [20]:
# Save the combined table as CSV (default to_csv options, so the index is written too).
# to_csv does not create missing folders, so make sure the target directory exists
# first; otherwise the export fails on a fresh checkout without data/dst.
csv_path = Path('data/dst/pandas_read_html_sample.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path)