# In [1]:
from bs4 import BeautifulSoup
import os
import glob
# In [ ]:
# Scrape the first <table> of every HTML page under raw_html/ and write one
# ';'-separated record per page to all_packages_metadata.csv.

# Labels found in the first cell of a metadata table row, listed in the
# order their values appear in the output (after the package name).
FIELD_LABELS = [
    "Version:",
    "Depends:",
    "Imports:",
    "Published:",
    "Author:",
    "Maintainer:",
    "License:",
]

# Output column index for each label; column 0 holds the package name.
COLUMN_OF = {label: pos for pos, label in enumerate(FIELD_LABELS, start=1)}


def clean_cell(text):
    """Return *text* made safe for a single ';'-separated output line.

    Newlines become spaces, ';' (the field separator) becomes ',' and
    double quotes are dropped.
    """
    return text.replace("\n", " ").replace(";", ",").replace('"', "")


def extract_values(package, rows):
    """Build one output record from the rows of a metadata table.

    Args:
        package: value for the first output column.
        rows: iterable of rows, each a sequence of raw cell texts
            (label first, value second).

    Returns:
        A list with the package name followed by one value per entry of
        FIELD_LABELS; fields that are not present stay "NA".
    """
    values = ["NA"] * (len(FIELD_LABELS) + 1)
    values[0] = package
    for cells in rows:
        # Rows without a label/value pair (e.g. header rows made of <th>
        # cells, or empty rows) used to raise IndexError; skip them.
        if len(cells) < 2:
            continue
        label = clean_cell(cells[0])
        position = COLUMN_OF.get(label)
        if position is not None:
            values[position] = clean_cell(cells[1])
    return values


i = 0
with open("all_packages_metadata.csv", "w", encoding="utf-8") as fo:
    fo.write(";".join(["Package"] + FIELD_LABELS) + "\n")
    # glob returns files in arbitrary order; sort for reproducible output.
    for myfile in sorted(glob.glob("raw_html/*.html")):
        # Hand raw bytes to BeautifulSoup so it detects the document's
        # encoding itself instead of relying on the platform default.
        with open(myfile, "rb") as f:
            html = BeautifulSoup(f.read(), "lxml")

        tables = html.find_all("table")
        if tables:
            rows = [
                [cell.get_text() for cell in row.find_all("td")]
                for row in tables[0].find_all("tr")
            ]
        else:
            # A page without any table still gets a record (all "NA")
            # instead of aborting the whole run.
            print("no table found in", myfile)
            rows = []

        # The package name is the file name without its ".html" suffix.
        values = extract_values(os.path.basename(myfile)[:-5], rows)
        fo.write(";".join(values) + "\n")

        # Progress report every 100 files.
        if i % 100 == 0:
            print(i, values[0])
        i = i + 1
# In [ ]: