In [160]:
using LibExpat
In [161]:
# List the names exported by the LibExpat module.
names(LibExpat)
Out[161]:
In [162]:
# Small sample document: a root <blah> element holding some text and two
# child elements (<blue> and <red>), each with `id` and `class` attributes.
sm = """<blah id="42" class="top">hi
<blue id="1" class="cold">hey</blue>
<red id="2" class="hot">yo</red>
</blah>"""
Out[162]:
In [162]:
# Parse the document held in `s` into `et`.
# NOTE(review): `s` is only assigned in a later cell of this notebook (the
# flight-info HTML), so that cell has to be run before this one.
et=xp_parse(s);
Let's check the structure of a simple ETree
In [163]:
# Parse the small sample document and print the structure of the result.
esm = xp_parse(sm)
dump(esm)
In [164]:
# The root element's `name` and `attr` fields, shown together as a tuple.
esm.name, esm.attr
Out[164]:
In [165]:
# The root element's `elements` field (iterated entry by entry further down).
esm.elements
Out[165]:
In [166]:
# Is the first entry of the root's `elements` a string?
isa(esm.elements[1], String)
Out[166]:
In [167]:
# Walk the root's `elements`, skipping entries that are blank once stripped.
for node in esm.elements
    txt = strip(string(node))
    isempty(txt) && continue
    println(txt, " ", typeof(node))
    # String entries are additionally reported as payload.
    isa(node, String) && println("Payload: ", txt)
end
In [167]:
# Sample HTML fragment used for the rest of the notebook: a <div> wrapping an
# outer table whose single cell contains an inner table with a title cell
# (class "table_header"), a row of column labels (class "table_row_header")
# and one data row (class "table_td").
s="""<div id="flight_container" style="padding: 2px;">
<table class="table_sides" width="100%" cellpadding="0" cellspacing="0" border="0" align=""><tbody><tr>
<td bgcolor="FFFFFF">
<table width="100%" border="0" cellpadding="4" cellspacing="0" class=""><thead>
<tr><td colspan="15" class="table_header" align="left">Flight Info - NXXXXXX(Rogers Bleeblah #) </td></tr>
<tr>
<td width="" class="table_row_header" align="left" valign="middle">Date</td>
<td width="" class="table_row_header" align="left" valign="middle">Origin</td>
<td width="" class="table_row_header" align="left" valign="middle">Dest</td>
<td width="" class="table_row_header" align="left" valign="middle">Depart</td>
<td width="" class="table_row_header" align="left" valign="middle">Arrive</td>
<td width="" class="table_row_header" align="left" valign="middle">Hobbs</td>
<td width="" class="table_row_header" align="left" valign="middle">Flight Time</td>
<td width="" class="table_row_header" align="left" valign="middle">Ground Time</td>
<td width="" class="table_row_header" align="left" valign="middle">Flight Distance</td>
<td width="" class="table_row_header" align="left" valign="middle">Taxi Distance</td>
<td width="" class="table_row_header" align="left" valign="middle">Fuel</td>
<td width="" class="table_row_header" align="left" valign="middle">Fuel/hr</td>
<td width="" class="table_row_header" align="left" valign="middle">Fuel/nm</td>
<td width="" class="table_row_header" align="left" valign="middle">Altitude</td>
<td width="" class="table_row_header" align="left" valign="middle">Gnd Speed</td>
</tr></thead><tbody>
<tr class="table_row1" onmouseover="style.backgroundColor='#FFF9C4'" onmouseout="style.backgroundColor='#FFFFFF'">
<td width="" class="table_td" align="left" valign="top">Mon, May xx, 2010</td>
<td width="" class="table_td" align="left" valign="top">KMYF</td>
<td width="" class="table_td" align="left" valign="top">XXXX</td>
<td width="" class="table_td" align="left" valign="top">10:44</td>
<td width="" class="table_td" align="left" valign="top">12:43</td>
<td width="" class="table_td" align="left" valign="top">1.92 hrs</td>
<td width="" class="table_td" align="left" valign="top">1.8 hrs (1:48)</td>
<td width="" class="table_td" align="left" valign="top">0.12 hrs (0:07)</td>
<td width="" class="table_td" align="left" valign="top">177.27 nm</td>
<td width="" class="table_td" align="left" valign="top">1.32 nm</td>
<td width="" class="table_td" align="left" valign="top">16.69 gal</td>
<td width="" class="table_td" align="left" valign="top">8.68 gal/hr</td>
<td width="" class="table_td" align="left" valign="top">0.09 gal/nm</td>
<td width="" class="table_td" align="left" valign="top">9511 msl</td>
<td width="" class="table_td" align="left" valign="top">95.21 kts</td>
</tr>
</tbody></table>
</td></tr></tbody></table>
</div>
""";
In [168]:
# Parse `s` right here so this cell does not depend on an `et` left over from
# an earlier cell (the earlier `et=xp_parse(s)` cell appears before `s` is
# assigned). Then select every <td> found under a table nested inside the
# outer table of the top-level <div>.
et = xp_parse(s)
tds = LibExpat.find(et, "/div/table//table//td")
Out[168]:
In [169]:
# Take the first matched <td> for closer inspection.
el = tds[1]
Out[169]:
In [170]:
# Show the type of a single match returned by the query.
typeof(el)
Out[170]:
Just get the text of the element:
In [171]:
# String form of the element.
string(el)
Out[171]:
Check the attribute Dict to identify elements by class
In [172]:
# Direct lookup of the "class" attribute; indexing assumes the key is present.
el.attr["class"]
Out[172]:
In [173]:
# Same lookup with a default: yields "" when there is no "class" attribute.
get(el.attr, "class","")
Out[173]:
Parse the table header text to get the flight's acid and actype
In [174]:
# Split a flight-table title of the form
#     "<title> - <acid> (<actype> #)"
# into its two pieces.
#
# Arguments:
#   hdr  - the title text, e.g. "Flight Info - NXXXXXX (Rogers Bleeblah #) ".
#          Callers pass the text of the "table_header" cell.
#
# Returns the tuple (acid, actype), both with surrounding whitespace removed
# and with the "#)" marker removed from actype.
#
# Throws an ArgumentError when `hdr` has no '-' followed by a '('.
function parse_header( hdr )
    # Only the FIRST '-' and the first '(' after it act as separators, so
    # hyphens or parentheses later in the text (e.g. "C-GABC", "C-172") are
    # kept instead of truncating the result. The `s` flag lets `.` span
    # newlines should the cell text be wrapped.
    m = match(r"^[^-]*-([^(]*)\((.*)$"s, hdr)
    if m === nothing
        throw(ArgumentError("unrecognised flight header: \"$hdr\""))
    end
    acid = strip(m.captures[1])
    # Remove the "#)" marker; split/join is used because it behaves the same
    # on old and current Julia versions, unlike `replace`.
    actype = strip(join(split(m.captures[2], "#)"), ""))
    return (acid, actype)
end
Out[174]:
In [175]:
# Try the parser on a sample title string.
parse_header( "Flight Info - NXXXXXX (Rogers Bleeblah #) " )
Out[175]:
Extract element payloads
In [176]:
# Sort the text of every matched <td> into three buckets by its "class"
# attribute:
#   "table_header"      -> title text, parsed into `acid` / `actype`
#   "table_row_header"  -> column label, appended to `labels`
#   "table_td"          -> data value, appended to `values`
#
# `String[]` rather than `ASCIIString[]`, so that cell text containing
# non-ASCII characters can be stored as well.
# NOTE(review): the name `values` is the same as Base's `values` function; it
# is kept because the following cells refer to it.
labels = String[]
values = String[]
hdr = ""
# Assigned before the loop so they are still defined after it, even when the
# loop never sees a "table_header" cell.
acid = ""
actype = ""
for td in tds
    # Write to the notebook-level variables, not to loop-local copies.
    global hdr, acid, actype
    cls = get(td.attr, "class", "")
    # An empty <td></td> has no first child; record it as "" so labels and
    # values stay aligned position by position.
    cell = isempty(td.elements) ? "" : strip(td.elements[1])
    if cls == "table_header"
        hdr = cell
        (acid, actype) = parse_header(hdr)
    elseif cls == "table_td"
        push!(values, cell)
    elseif cls == "table_row_header"
        push!(labels, cell)
    end
end
In [177]:
# Show the two values taken from the table header by parse_header.
acid, actype
Out[177]:
Load to Dict()
In [178]:
# Build a label => value lookup by pairing labels and values position by
# position. Values ending in a digit are stored unchanged (e.g. "10:44");
# any other value is cut down to its first space-separated token
# (e.g. "1.92 hrs" -> "1.92").
length(values) >= length(labels) ||
    error("found $(length(labels)) labels but only $(length(values)) values")
dmap = Dict()
# zip stops at the shorter list, so values beyond the last label are ignored,
# just as with the index-based pairing.
for (label, v) in zip(labels, values)
    # The empty-string check comes first: `v[end]` is invalid for "".
    if isempty(v) || isdigit(v[end])
        dmap[label] = v
    else
        dmap[label] = split(v, ' ')[1]
    end
end
dump(dmap)
In [159]: