In [1]:
import csv
import glob
import os
import pprint

from bs4 import BeautifulSoup
import markdown2
import pandas as pd

In [2]:
def get_links_from_md(file_path, markdowner=markdown2.Markdown()):
    """Extract every hyperlink from a Markdown file.

    The file is rendered to HTML with ``markdowner`` and the ``<a>``
    elements of that HTML are collected with BeautifulSoup.

    Args:
        file_path: Path of the Markdown file to read (decoded as UTF-8).
        markdowner: Object whose ``convert(text)`` method returns HTML.
            The default instance is created once, when the function is
            defined, and is shared by every call that omits this argument.

    Returns:
        A list of ``[file_path, anchor_text, href]`` lists, one per ``<a>``
        element found. ``href`` is ``None`` when the element has no
        ``href`` attribute.
    """
    # Pin the encoding: without it open() uses the locale's preferred
    # encoding, so the same file could decode differently (or fail) on
    # another machine.
    with open(file_path, encoding='utf-8') as f:
        md = f.read()
    html = markdowner.convert(md)
    soup = BeautifulSoup(html, 'html.parser')
    return [[file_path, a.text, a.attrs.get('href')] for a in soup.find_all('a')]

In [3]:
def get_links_from_md_in_list(file_path_list, markdowner=markdown2.Markdown()):
    """Collect the links of several Markdown files into one flat list.

    Args:
        file_path_list: Iterable of Markdown file paths, processed in order.
        markdowner: Converter passed through to ``get_links_from_md``.

    Returns:
        The rows returned by ``get_links_from_md`` for each path,
        concatenated in the order the paths were given.
    """
    return [
        row
        for path in file_path_list
        for row in get_links_from_md(path, markdowner)
    ]

In [4]:
def get_links_from_md_in_dir(dir_path, markdowner=markdown2.Markdown()):
    """Collect the links of every ``*.md`` file under a directory.

    The search is recursive (``**`` with ``recursive=True``), so files in
    sub-directories are included. Files are processed in whatever order
    ``glob.glob`` returns them.

    Args:
        dir_path: Directory to search.
        markdowner: Converter passed through to ``get_links_from_md_in_list``.

    Returns:
        The combined list produced by ``get_links_from_md_in_list``.
    """
    pattern = os.path.join(dir_path, '**', '*.md')
    md_paths = glob.glob(pattern, recursive=True)
    return get_links_from_md_in_list(md_paths, markdowner)

In [5]:
# Show the raw Markdown sample that the following cells extract links from.
# The encoding is pinned so the read does not depend on the platform's
# locale default.
with open('data/src/md/test1.md', encoding='utf-8') as f:
    print(f.read())


[Instagram](https://www.instagram.com/) and [Twitter](https://twitter.com)

- [[Py] Python.org](https://www.python.org/)
- [relative link](../test/)


In [6]:
# Run the extractor on the sample file; each row is [file, anchor text, href].
pprint.pprint(get_links_from_md('data/src/md/test1.md'))


[['data/src/md/test1.md', 'Instagram', 'https://www.instagram.com/'],
 ['data/src/md/test1.md', 'Twitter', 'https://twitter.com'],
 ['data/src/md/test1.md', '[Py] Python.org', 'https://www.python.org/'],
 ['data/src/md/test1.md', 'relative link', '../test/']]

In [7]:
# Read the sample Markdown into `md` for the step-by-step walkthrough below.
# Explicit UTF-8 keeps the read independent of the platform's locale encoding.
with open('data/src/md/test1.md', encoding='utf-8') as f:
    md = f.read()

In [8]:
# Step 1 of what get_links_from_md does: render the Markdown text to HTML.
# `markdowner` and `html` are reused by later cells.
markdowner = markdown2.Markdown()
html = markdowner.convert(md)
print(html)


<p><a href="https://www.instagram.com/">Instagram</a> and <a href="https://twitter.com">Twitter</a></p>

<ul>
<li><a href="https://www.python.org/">[Py] Python.org</a></li>
<li><a href="../test/">relative link</a></li>
</ul>


In [9]:
# Step 2: parse the rendered HTML and collect every <a> element.
# `l` is read by the next cell.
l = BeautifulSoup(html, 'html.parser').find_all('a')
pprint.pprint(l)


[<a href="https://www.instagram.com/">Instagram</a>,
 <a href="https://twitter.com">Twitter</a>,
 <a href="https://www.python.org/">[Py] Python.org</a>,
 <a href="../test/">relative link</a>]

In [10]:
# Take the first link to inspect what kind of object find_all() yields.
a = l[0]
print(type(a))


<class 'bs4.element.Tag'>

In [11]:
# All attributes of the tag; for this link that is only href.
print(a.attrs)


{'href': 'https://www.instagram.com/'}

In [12]:
# Link target. .get() is used (rather than indexing) so a missing href gives None.
print(a.attrs.get('href'))


https://www.instagram.com/

In [13]:
# Anchor text of the link.
print(a.text)


Instagram

In [14]:
# Parser comparison, part 1: render an ASCII-only paragraph as the baseline input.
html_en = markdowner.convert('abcde')
print(html_en)


<p>abcde</p>


In [15]:
# html.parser keeps the ASCII fragment as a bare <p> element.
print(BeautifulSoup(html_en, 'html.parser'))


<p>abcde</p>


In [16]:
# lxml parses the same fragment but wraps it in <html><body>.
print(BeautifulSoup(html_en, 'lxml'))


<html><body><p>abcde</p>
</body></html>

In [17]:
# Parser comparison, part 2: the same paragraph with Japanese (non-ASCII) text.
html_jp = markdowner.convert('あいうえお')
print(html_jp)


<p>あいうえお</p>


In [18]:
# html.parser handles the Japanese fragment the same way as the ASCII one.
print(BeautifulSoup(html_jp, 'html.parser'))


<p>あいうえお</p>


In [19]:
# NOTE: in the recorded run this printed nothing -- lxml produced an empty
# document for this str input. The cause is not established here; the
# following cells try variations that do parse.
print(BeautifulSoup(html_jp, 'lxml'))




In [20]:
# Variation 1: pass UTF-8 bytes instead of str -- lxml then parses the paragraph.
print(BeautifulSoup(html_jp.encode(), 'lxml'))


<html><body><p>あいうえお</p>
</body></html>

In [21]:
# Variation 2: a hand-written str mixing ASCII and Japanese text parses fine with lxml.
print(BeautifulSoup('<p>abcdeあいうえお</p>', 'lxml'))


<html><body><p>abcdeあいうえお</p></body></html>

In [22]:
# Variation 3: wrapping the converted fragment in <html>...</html> also makes lxml parse it.
print(BeautifulSoup('<html>' + html_jp + '</html>', 'lxml'))


<html><body><p>あいうえお</p>
</body></html>

In [23]:
# Variation 4: prefixing ASCII text works too; lxml puts the bare text in its own <p>.
print(BeautifulSoup('abcde' + html_jp, 'lxml'))


<html><body><p>abcde</p><p>あいうえお</p>
</body></html>

In [24]:
# Non-recursive pattern: only the .md files directly inside data/src/md/ are scanned.
pprint.pprint(get_links_from_md_in_list(glob.glob('data/src/md/*.md')))


[['data/src/md/test2.md', '[Py] Python.org', 'https://www.python.org/'],
 ['data/src/md/test2.md', 'relative link', '../test/'],
 ['data/src/md/test1.md', 'Instagram', 'https://www.instagram.com/'],
 ['data/src/md/test1.md', 'Twitter', 'https://twitter.com'],
 ['data/src/md/test1.md', '[Py] Python.org', 'https://www.python.org/'],
 ['data/src/md/test1.md', 'relative link', '../test/']]

In [25]:
# Recursive variant: files in sub-directories (here sub_dir/test_sub.md) are included.
pprint.pprint(get_links_from_md_in_dir('data/src/md/'))


[['data/src/md/test2.md', '[Py] Python.org', 'https://www.python.org/'],
 ['data/src/md/test2.md', 'relative link', '../test/'],
 ['data/src/md/test1.md', 'Instagram', 'https://www.instagram.com/'],
 ['data/src/md/test1.md', 'Twitter', 'https://twitter.com'],
 ['data/src/md/test1.md', '[Py] Python.org', 'https://www.python.org/'],
 ['data/src/md/test1.md', 'relative link', '../test/'],
 ['data/src/md/sub_dir/test_sub.md', 'Instagram', 'https://www.instagram.com/'],
 ['data/src/md/sub_dir/test_sub.md', 'Twitter', 'https://twitter.com']]

In [26]:
# Export the links with the standard csv module, header row first.
l = get_links_from_md('data/src/md/test1.md')
l.insert(0, ['file', 'anchor text', 'URL'])

# newline='' lets csv.writer control the line endings itself; without it
# every row ends in \r\r\n on Windows. The encoding is pinned so non-ASCII
# anchor text is written the same way on every platform.
with open('data/temp/md_links_csv.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(l)

In [27]:
# Same export via pandas: put the rows in a DataFrame with named columns.
# `df` is written to disk in the next cell.
l = get_links_from_md('data/src/md/test1.md')
df = pd.DataFrame(l, columns=['file', 'anchor text', 'URL'])
print(df)


                   file      anchor text                         URL
0  data/src/md/test1.md        Instagram  https://www.instagram.com/
1  data/src/md/test1.md          Twitter         https://twitter.com
2  data/src/md/test1.md  [Py] Python.org     https://www.python.org/
3  data/src/md/test1.md    relative link                    ../test/

In [28]:
# index=False keeps the DataFrame's row index out of the CSV file.
df.to_csv('data/temp/md_links_df.csv', index=False)