In [1]:
import csv
import glob
import os
import pprint
from bs4 import BeautifulSoup
import markdown2
import pandas as pd
In [2]:
def get_links_from_md(file_path, markdowner=markdown2.Markdown(), encoding='utf-8'):
    """Extract every hyperlink from a Markdown file.

    The file is rendered to HTML with ``markdowner`` and the resulting
    ``<a>`` tags are collected with BeautifulSoup's ``html.parser``.

    Args:
        file_path: Path of the Markdown file to read.
        markdowner: Object with a ``convert(text)`` method that returns HTML.
            The default instance is created once, when this function is
            defined, and is shared by all calls that do not pass their own.
        encoding: Text encoding used to read the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of ``[file_path, anchor_text, href]`` lists, one per ``<a>``
        tag in document order. ``href`` is ``None`` for anchors that have no
        ``href`` attribute.
    """
    # Explicit encoding: a bare open() falls back to the locale's preferred
    # encoding, which is not UTF-8 on every platform.
    with open(file_path, encoding=encoding) as f:
        md_text = f.read()
    html = markdowner.convert(md_text)
    soup = BeautifulSoup(html, 'html.parser')
    return [
        [file_path, a.text, a.attrs.get('href')]
        for a in soup.find_all('a')
    ]
In [3]:
def get_links_from_md_in_list(file_path_list, markdowner=markdown2.Markdown()):
    """Collect the links of several Markdown files into one flat list.

    Args:
        file_path_list: Iterable of Markdown file paths, processed in order.
        markdowner: Converter forwarded unchanged to ``get_links_from_md``.

    Returns:
        The concatenation of the per-file results of ``get_links_from_md``,
        in the order the paths were given.
    """
    return [
        link_row
        for md_path in file_path_list
        for link_row in get_links_from_md(md_path, markdowner)
    ]
In [4]:
def get_links_from_md_in_dir(dir_path, markdowner=markdown2.Markdown()):
    """Collect the links of every ``*.md`` file under a directory tree.

    Args:
        dir_path: Root directory to search.
        markdowner: Converter forwarded unchanged to
            ``get_links_from_md_in_list``.

    Returns:
        Whatever ``get_links_from_md_in_list`` returns for the matched paths.
    """
    # With recursive=True, '**' matches zero or more directory levels, so
    # files directly inside dir_path are included as well as nested ones.
    pattern = os.path.join(dir_path, '**', '*.md')
    md_paths = glob.glob(pattern, recursive=True)
    return get_links_from_md_in_list(md_paths, markdowner)
In [5]:
# Show the raw Markdown source of the sample file.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale encoding.
with open('data/src/md/test1.md', encoding='utf-8') as f:
    print(f.read())
In [6]:
# Pretty-print the [file, anchor text, href] rows extracted from the sample file.
pprint.pprint(get_links_from_md('data/src/md/test1.md'))
In [7]:
# Load the sample Markdown into `md`; the next cell converts it to HTML.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale encoding.
with open('data/src/md/test1.md', encoding='utf-8') as f:
    md = f.read()
In [8]:
# Step-by-step walk-through of what get_links_from_md does internally:
# build a converter, render the Markdown text to HTML, and show the HTML.
# `markdowner` and `html` are reused by later cells.
markdowner = markdown2.Markdown()
html = markdowner.convert(md)
print(html)
In [9]:
# Parse the rendered HTML and collect every <a> tag.
# `l` is indexed by the next cell.
l = BeautifulSoup(html, 'html.parser').find_all('a')
pprint.pprint(l)
In [10]:
# Take the first anchor found (raises IndexError if the file had no links)
# and show what kind of object BeautifulSoup returned for it.
a = l[0]
print(type(a))
In [11]:
# Show all attributes of the anchor tag.
print(a.attrs)
In [12]:
# Look up the link target; .get() returns None instead of raising when the
# tag has no href attribute.
print(a.attrs.get('href'))
In [13]:
# Show the anchor's text content.
print(a.text)
In [14]:
# Parser comparison, ASCII case: render a plain ASCII string to HTML.
# `html_en` is parsed with two different parsers in the following cells.
html_en = markdowner.convert('abcde')
print(html_en)
In [15]:
# ASCII fragment parsed with Python's built-in html.parser.
print(BeautifulSoup(html_en, 'html.parser'))
In [16]:
# Same ASCII fragment parsed with the lxml parser, for comparison with the
# html.parser output.
print(BeautifulSoup(html_en, 'lxml'))
In [17]:
# Parser comparison, non-ASCII case: render a Japanese string to HTML.
# `html_jp` is reused by the following cells.
html_jp = markdowner.convert('あいうえお')
print(html_jp)
In [18]:
# Japanese fragment parsed with Python's built-in html.parser.
print(BeautifulSoup(html_jp, 'html.parser'))
In [19]:
# Same Japanese fragment parsed with lxml; compare with the html.parser output.
# NOTE(review): presumably probing how lxml handles non-ASCII text here —
# outputs are not part of this export, confirm by re-running.
print(BeautifulSoup(html_jp, 'lxml'))
In [20]:
# Variation: pass the fragment to lxml as bytes (str.encode() defaults to
# UTF-8) instead of as a str.
print(BeautifulSoup(html_jp.encode(), 'lxml'))
In [21]:
# Variation: a hand-written paragraph mixing ASCII and Japanese text, parsed
# with lxml.
print(BeautifulSoup('<p>abcdeあいうえお</p>', 'lxml'))
In [22]:
# Variation: wrap the Japanese fragment in explicit <html> tags before
# parsing with lxml.
print(BeautifulSoup('<html>' + html_jp + '</html>', 'lxml'))
In [23]:
# Variation: put ASCII text in front of the Japanese fragment before parsing
# with lxml.
print(BeautifulSoup('abcde' + html_jp, 'lxml'))
In [24]:
# Extract links from every .md file directly inside data/src/md/
# (non-recursive glob).
pprint.pprint(get_links_from_md_in_list(glob.glob('data/src/md/*.md')))
In [25]:
# Extract links from every .md file under data/src/md/, including
# subdirectories.
pprint.pprint(get_links_from_md_in_dir('data/src/md/'))
In [26]:
# Export the extracted links to CSV with the standard-library csv module.
link_rows = get_links_from_md('data/src/md/test1.md')
# newline='' is required by the csv module: without it the writer's own
# '\r\n' line endings are translated again on Windows, producing '\r\r\n'
# (blank rows). UTF-8 is pinned so non-ASCII anchor text is written the same
# way on every platform.
with open('data/temp/md_links_csv.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Header first, then the data rows; the extracted list is left unmodified.
    writer.writerow(['file', 'anchor text', 'URL'])
    writer.writerows(link_rows)
In [27]:
# Alternative export route via pandas: load the link rows into a DataFrame
# with named columns. `df` is written to CSV in the next cell.
l = get_links_from_md('data/src/md/test1.md')
df = pd.DataFrame(l, columns=['file', 'anchor text', 'URL'])
print(df)
In [28]:
# Write the DataFrame to CSV; index=False leaves the row index out of the file.
df.to_csv('data/temp/md_links_df.csv', index=False)