# In [12]:
import re
import yaml
import sys
import struct

from lxml import etree

# Download the ACD syntax page and parse it as HTML; every XPath query in
# this script runs against the resulting tree.
# Disabled alternative sources, kept for reference:
#   qualifiers_page = etree.parse('http://emboss.open-bio.org/html/use/apcs02.html', parser)
#   attributes_resp = etree.parse('http://emboss.open-bio.org/html/dev/apas05.html', parser)
parser = etree.HTMLParser()
datatypes_page = etree.parse(
    'http://emboss.sourceforge.net/developers/acd/syntax.html',
    parser=parser,
)
# Name of the datatype currently being processed; it is (re)assigned by the
# parsing code further down in the script.
datatype = ''

# Largest value of a C ``int`` on this platform, derived from its byte width
# (2**31 - 1 when ``int`` is 4 bytes wide).
maxint = 2 ** (struct.calcsize('i') * 8 - 1) - 1

# Type labels as they appear in the scraped page -> Python type.
TYPES = {
    'integer': int,
    'Y/N': bool,
    'string': str,
    'float': float,
    'boolean': bool,
}

# Textual default values as they appear in the scraped page -> Python value.
VALUES = {
    '0': 0,
    '1': 1,
    'N': False,
    'Y': True,
    '""': '',
    '': '',
    # NOTE(review): C's FLT_MAX is the single-precision maximum, whereas
    # sys.float_info.max is the largest Python (double-precision) float.
    # Confirm which of the two is intended here.
    'FLT_MAX': sys.float_info.max,
    '-FLT_MAX': -sys.float_info.max,
    # INT_MIN is the most negative C int, i.e. -(INT_MAX) - 1. The previous
    # ``maxint - 1`` produced a large positive number instead.
    'INT_MIN': -maxint - 1,
    'INT_MAX': maxint,
}

# Result structure, keyed by datatype name; filled by the parsing code below
# and finally written out as YAML.
datatypes_reference = {}

# Datatypes: walk the rows of the table(s) following the "sect23" anchor that
# carry bold text in a non-colspan cell, and register each datatype together
# with its description and its direction (INPUT or OUTPUT).
datatype_rows = datatypes_page.xpath('//a[@name="sect23"]/following-sibling::table/tr[td[not(@colspan)]/p/b/text()]')
for row in datatype_rows:
    bold_texts = row.xpath('td/p/b/text()')
    # Only rows with exactly one bold text node are treated as datatype rows.
    if len(bold_texts) != 1:
        continue
    datatype_name = str(bold_texts[0])
    # The direction is guessed from the closest preceding "subsection" row
    # (the one whose cell has colspan="5"): output and graphics sections
    # describe outputs, everything else is considered an input.
    section_title = row.xpath('./preceding-sibling::tr[td/@colspan="5"][1]/td/p/b/text()')[0]
    direction = "OUTPUT" if section_title in ("Output types", "Graphics types") else "INPUT"
    summary = str(row.xpath('td[2]/p/text()')[0])
    datatypes_reference[datatype_name] = {
        'description': summary,
        'qualifiers': {},
        'attributes': {},
        'type': direction,
    }

# Qualifiers: walk the non-bold rows of the first table following each of the
# anchors sect2531, sect2532 and sect2533, and attach every qualifier to the
# datatype it belongs to.
current_datatype = ""
qualifier_rows = datatypes_page.xpath('//a[@name="sect2531" or @name="sect2532" or @name="sect2533"]/following-sibling::table[1]/tr[not(td/p/b)]')
for row in qualifier_rows:
    first_cell = row.xpath('td[1]/p/text()')
    if len(first_cell) != 1:
        continue
    # A blank first cell means the row still belongs to the previous datatype.
    stripped_name = str(first_cell[0].strip())
    if stripped_name != '':
        current_datatype = stripped_name
    # The qualifier name is the cell text without its first and last character.
    qualifier_name = row.xpath('./td[2]/p/text()')[0].strip()[1:-1]
    # The italic text of the same cell holds the type label.
    py_type = TYPES[row.xpath('./td[2]/p/i/text()')[0]]
    # Third cell: first text node is the description, second the default.
    third_cell = row.xpath('./td[3]/p/text()')
    summary = third_cell[0]
    default_key = third_cell[1].replace('Default: ', '').strip()
    entry = {
        'value_type': py_type.__name__,
        'description': str(summary),
        'default_value': VALUES[default_key],
    }
    datatypes_reference[current_datatype]['qualifiers'][qualifier_name] = entry

# Attributes: walk the non-bold rows of the first table following each <h5>
# heading whose text starts with "2.4.1.2.1" ... "2.4.1.2.5", and attach every
# attribute to the datatype it belongs to.
# NOTE(review): the leading '////' in the XPath is unusual ('//' is the
# standard form); kept unchanged here - confirm before simplifying.
current_datatype = ""
attribute_rows = datatypes_page.xpath('////h5[starts-with(text(),"2.4.1.2.1") or starts-with(text(),"2.4.1.2.2") or starts-with(text(),"2.4.1.2.3") or starts-with(text(),"2.4.1.2.4") or starts-with(text(),"2.4.1.2.5")]/following-sibling::table[1]/tr[not(td/p/b)]')
for row in attribute_rows:
    first_cell = row.xpath('td[1]/p/text()')
    if len(first_cell) != 1:
        continue
    # A blank first cell means the row still belongs to the previous datatype.
    stripped_name = str(first_cell[0].strip())
    if stripped_name != '':
        current_datatype = stripped_name
    # The attribute name is the cell text without its last character.
    attribute_name = row.xpath('./td[2]/p/text()')[0].strip()[:-1]
    # The italic text of the same cell holds the type label.
    py_type = TYPES[row.xpath('./td[2]/p/i/text()')[0]]
    # Third cell, including text of nested elements: the first node is the
    # description. With more than two nodes the default is the third node minus
    # its first and last character; otherwise it follows "Default: " in the
    # second node.
    third_cell = row.xpath('./td[3]/p//text()')
    summary = third_cell[0]
    raw_default = (
        third_cell[2][1:-1]
        if len(third_cell) > 2
        else third_cell[1].replace('Default: ', '').strip()
    )
    # Known symbolic defaults come from VALUES; anything else is converted
    # with the attribute's own Python type.
    default_value = VALUES.get(raw_default)
    if default_value is None:
        default_value = py_type(raw_default)
    entry = {
        'default_value': default_value,
        'value_type': py_type.__name__,
        'description': str(summary),
    }
    datatypes_reference[current_datatype]['attributes'][attribute_name] = entry
# Serialise the collected reference to YAML and store it next to the script.
with open('datatypes_reference.yml', 'w') as yaml_file:
    yaml_file.write(yaml.safe_dump(datatypes_reference))