In [1]:
from bs4 import BeautifulSoup
import os
import glob

In [ ]:
i = 0
with open("all_packages_metadata.csv", "w") as fo:
    fo.write(";".join(["Package", 
                          "Version:",
                          'Depends:',
                          'Imports:',
                          'Published:',
                          'Author:',
                          'Maintainer:',
                          'License:']) + "\n")
    for myfile in glob.glob("raw_html/*.html"):
        with open(myfile) as f:
            html = f.read()
        html = BeautifulSoup(html, "lxml")
        table = html.find_all('table')[0]
        values = ['NA'] * 8
        values[0] = os.path.basename(myfile)[:-5]
        for row in table.find_all('tr'):
            columns = row.find_all('td')
            tmp = []
            for column in columns:
                tmp.append(column.get_text().replace("\n", " ").replace(";", ",").replace('"',""))
            if tmp[0] == "Version:":
                values[1] = tmp[1]
            elif tmp[0] == "Depends:":
                values[2] = tmp[1]
            elif tmp[0] == "Imports:":
                values[3] = tmp[1]
            elif tmp[0] == "Published:":
                values[4] = tmp[1]
            elif tmp[0] == "Author:":
                values[5] = tmp[1]
            elif tmp[0] == "Maintainer:":
                values[6] = tmp[1]
            elif tmp[0] == "License:":
                values[7] = tmp[1]
        fo.write(";".join(values) + "\n")
        if i % 100 == 0:
            print(i, values[0])
        i = i + 1


0 bitrugs
100 SensitivityCaseControl
200 msgps
300 ABHgenotypeR
400 signmedian.test
500 SMC
600 vembedr
700 SurvRegCensCov
800

In [ ]: