# In [12]:
import re
import yaml
import sys
import struct
from lxml import etree
# HTML parser handed to etree.parse below.
parser = etree.HTMLParser()
# Other EMBOSS reference pages, currently not fetched:
#   qualifiers: http://emboss.open-bio.org/html/use/apcs02.html
#   attributes: http://emboss.open-bio.org/html/dev/apas05.html
# The ACD syntax page is the single document every table below is read from.
datatypes_page = etree.parse(
    'http://emboss.sourceforge.net/developers/acd/syntax.html',
    parser=parser,
)
# Name of the datatype currently being processed; reset before each pass.
datatype = ""
# Largest value of a signed C `int` on this platform, derived from the byte
# size that `struct` reports for the 'i' format.
maxint = 2 ** (struct.Struct('i').size * 8 - 1) - 1

# Maps the type labels looked up while parsing the tables to Python types.
TYPES = {
    'integer': int,
    'Y/N': bool,
    'string': str,
    'float': float,
    'boolean': bool,
}

# Maps the textual default values looked up while parsing the tables to
# Python values.
VALUES = {
    '0': 0,
    '1': 1,
    'N': False,
    'Y': True,
    '""': '',
    '': '',
    'FLT_MAX': sys.float_info.max,
    '-FLT_MAX': -sys.float_info.max,
    # INT_MIN is the most negative int, i.e. -INT_MAX - 1 in two's complement
    # (the previous `maxint - 1` produced a large positive number).
    'INT_MIN': -maxint - 1,
    'INT_MAX': maxint,
}
# Reference built by this script: one entry per datatype name, holding its
# description, its INPUT/OUTPUT kind and (filled in later) its qualifiers
# and attributes.
datatypes_reference = {}

# Pass 1: datatype names, descriptions and input/output kind.
datatype_rows = datatypes_page.xpath(
    '//a[@name="sect23"]/following-sibling::table/tr[td[not(@colspan)]/p/b/text()]'
)
for row in datatype_rows:
    bold_names = row.xpath('td/p/b/text()')
    if len(bold_names) != 1:
        # Only rows carrying exactly one bold name are treated as datatypes.
        continue
    datatype = str(bold_names[0])
    # The kind is guessed from the title of the closest preceding
    # "subsection" row (the one whose cell has colspan="5").
    section_title = row.xpath(
        './preceding-sibling::tr[td/@colspan="5"][1]/td/p/b/text()'
    )[0]
    parameter_type = (
        "OUTPUT"
        if section_title in ("Output types", "Graphics types")
        else "INPUT"
    )
    description = str(row.xpath('td[2]/p/text()')[0])
    datatypes_reference[datatype] = {
        'description': description,
        'qualifiers': {},
        'attributes': {},
        'type': parameter_type,
    }
# Pass 2: qualifiers attached to each datatype.
datatype = ""
qualifier_rows = datatypes_page.xpath(
    '//a[@name="sect2531" or @name="sect2532" or @name="sect2533"]/following-sibling::table[1]/tr[not(td/p/b)]'
)
for row in qualifier_rows:
    first_cell = row.xpath('td[1]/p/text()')
    if len(first_cell) != 1:
        continue
    # A blank first cell means the row belongs to the datatype named on an
    # earlier row, so the previous name is kept.
    datatype = str(first_cell[0].strip()) or datatype
    # The qualifier name is the cell text minus its first and last character.
    qualifier = row.xpath('./td[2]/p/text()')[0].strip()[1:-1]
    value_type = TYPES[row.xpath('./td[2]/p/i/text()')[0]]
    # First text node: description; second: the "Default: ..." entry.
    detail_texts = row.xpath('./td[3]/p/text()')
    description = detail_texts[0]
    default_value = VALUES[detail_texts[1].replace('Default: ', '').strip()]
    datatypes_reference[datatype]['qualifiers'][qualifier] = {
        'value_type': value_type.__name__,
        'description': str(description),
        'default_value': default_value,
    }
# Pass 3: attributes attached to each datatype.
datatype = ""
# The expression previously began with '////h5', which is not a valid
# XPath 1.0 location path; '//h5' selects the same nodes.
attribute_rows = datatypes_page.xpath(
    '//h5[starts-with(text(),"2.4.1.2.1")'
    ' or starts-with(text(),"2.4.1.2.2")'
    ' or starts-with(text(),"2.4.1.2.3")'
    ' or starts-with(text(),"2.4.1.2.4")'
    ' or starts-with(text(),"2.4.1.2.5")]'
    '/following-sibling::table[1]/tr[not(td/p/b)]'
)
for line in attribute_rows:
    datatype_res = line.xpath('td[1]/p/text()')
    if len(datatype_res) != 1:
        continue
    # A blank first cell means the row belongs to the datatype named on an
    # earlier row, so the previous name is kept.
    datatype = str(datatype_res[0].strip()) or datatype
    # The attribute name is the cell text minus its last character.
    attribute = line.xpath('./td[2]/p/text()')[0].strip()[:-1]
    value_type = TYPES[line.xpath('./td[2]/p/i/text()')[0]]
    # All text nodes of the third cell, including those in nested elements.
    description_default_list = line.xpath('./td[3]/p//text()')
    description = description_default_list[0]
    if len(description_default_list) > 2:
        # With more than two text nodes the default is the third one, minus
        # its first and last character (presumably surrounding quotes --
        # verify against the page).
        pre_value = description_default_list[2][1:-1]
    else:
        pre_value = description_default_list[1].replace('Default: ', '').strip()
    if pre_value in VALUES:
        default_value = VALUES[pre_value]
    else:
        # Unknown literal: convert it with the attribute's own type.
        # NOTE(review): for bool this makes any non-empty text True.
        default_value = value_type(pre_value)
    datatypes_reference[datatype]['attributes'][attribute] = {
        'default_value': default_value,
        'value_type': value_type.__name__,
        'description': str(description),
    }
# Serialise the collected reference to datatypes_reference.yml.
with open('datatypes_reference.yml', mode='w') as yaml_file:
    yaml.safe_dump(datatypes_reference, stream=yaml_file)