In [1]:
from bs4 import BeautifulSoup, NavigableString, Tag
fname = "resources/outputfiles/V_7_2/5ZoneCAVtoVAVWarmestTempFlowTable_ABUPS.html"
# Parse inside a `with` block so the file handle is closed as soon as the
# document has been read, instead of lingering until garbage collection.
# NOTE(review): no parser is named, so bs4 picks whichever one is installed;
# later cells treat <hr> as having children, which may depend on that choice —
# confirm before pinning a parser.
with open(fname, 'r') as html_file:
    soup = BeautifulSoup(html_file)
In [2]:
# Every <p>, <table> and <hr> tag found anywhere in the document.
btables = soup.find_all(name=['p', 'table', 'hr'])
In [3]:
# <p>, <table> and <hr> tags nested under the first <hr> of the document.
atables = soup.find('hr').find_all(name=['p', 'table', 'hr'])
In [4]:
# Visit the children of every <hr> held in btables. The inner body is a
# placeholder left from exploration, so the loop has no visible effect.
for i, t in enumerate(btables):
    if t.name != 'hr':
        continue
    for j in t:
        # print j.name
        pass
In [5]:
# All tags nested (at any depth) under the first <hr>, with no name filter.
ctables = soup.find('hr').find_all()
In [6]:
# Show the name of each direct child of <body>.
for child in soup.body:
    print(child.name)
In [7]:
# Number of direct children of the parsed document.
len(soup.contents)
Out[7]:
In [8]:
# NOTE(review): the body of this loop is missing from the exported notebook,
# so the cell is not runnable as shown — restore it or delete the cell.
for i in soup:
In [9]:
# Display every <p> tag in the document.
soup.find_all(name='p')
Out[9]:
In [10]:
# Show the node stored in the document's `.next` attribute
# (presumably a bs4 alias for `.next_element` — confirm).
print(soup.next)
In [11]:
# Keep the list of all <p> tags for the following cells.
p = soup.find_all(name='p')
In [12]:
# First child of the first <p>. The builtin next() replaces the
# Python-2-only iterator method `.next()`, so the cell runs on Python 2.6+ and 3.
next(p[0].children)
Out[12]:
In [13]:
# Iterator over the direct children of the first <hr>.
hr = soup.find('hr').children
In [14]:
# Pull the next child from the `hr` iterator. The builtin next() replaces the
# Python-2-only `.next()` method, so the cell runs on Python 2.6+ and 3.
next(hr)
Out[14]:
In [15]:
# Print the name of every child still left in the `hr` iterator.
for i in hr:
    print(i.name)
In [16]:
# Fresh iterator over the direct children of the first <hr>.
hr = soup.find('hr').children
In [17]:
# First <br> found under the first <hr>.
br = soup.find('hr').find('br')
In [17]:
In [18]:
# Print each <p> found among the descendants of the first <hr>; table
# rows/cells, <br> tags and unnamed nodes are filtered out first.
for i in soup.hr.descendants:
    if i.name in ('tr', 'td', None, 'br'):
        continue
    if i.name == 'p':
        print(i)
In [19]:
# Iterator over the siblings that follow the first <table>.
pr = soup.find('table').next_siblings
In [20]:
# Print the name of each item produced by `pr`.
for i in pr:
    print(i.name)
In [21]:
# `pr` is the iterator built from soup.table.next_siblings; it has no
# `previous_sibling` attribute, so the original line raised AttributeError.
# Ask the first <table> itself for the sibling that precedes it.
print(soup.table.previous_sibling)
In [22]:
# Iterator that walks backwards through the document from the first <table>.
pr = soup.find('table').previous_elements
In [41]:
# Walk backwards through `pr`, printing each named element until an <hr> is
# reached. Nodes without a `name` attribute, <br> tags and nodes whose name
# is None are skipped.
for i in pr:
    try:
        # Only probing for the attribute; the value is re-read below.
        i.name
    except AttributeError:
        # `except X:` (no `, e`) is valid on both Python 2 and 3, and the
        # unused `name` / `e` bindings of the original are gone.
        continue
    if i.name in ('br', None):
        continue
    if i.name == 'hr':
        break
    print(i)
In [24]:
# All <table> tags in the document.
tb = soup.find_all(name='table')
In [25]:
# Print the name of every <table> met while iterating over the first
# table's `.next` attribute.
# NOTE(review): `.next` is presumably a single node rather than the
# `next_elements` iterator, so this may not scan what was intended — confirm.
for el in soup.table.next:
    if el.name != 'table':
        continue
    print(el.name)
In [26]:
# The node that comes immediately after the opening of <body>.
d = soup.find('body').next_element
In [27]:
# Iterator over everything that follows the first <table> in the document.
e = soup.find('table').next_elements
In [51]:
def has_name(i):
    """Tell whether `i` exposes a ``name`` attribute.

    The loops in this cell call it to skip nodes that cannot be filtered
    by tag name.

    Args:
        i: any object, typically a node taken from the parsed document.

    Returns:
        True if reading ``i.name`` succeeds (even when the value is None),
        False if it raises AttributeError.
    """
    try:
        # Only the attribute lookup matters; the value itself is not used.
        i.name
    except AttributeError:
        return False
    return True
# Pass 1: walk backwards from the first <table> and print the name of each
# tag met before an <hr>; <br> tags and unnamed nodes are skipped.
pr = soup.table.previous_elements
for i in pr:
    if not has_name(i) or i.name in ('br', None):
        continue
    if i.name == 'hr':
        break
    print(i.name)

# Pass 2: for every <table> after the first one, repeat the backwards walk,
# stopping at a <table>, <tr>, <td> or <hr>.
e = soup.table.next_elements
for j in e:
    if not has_name(j) or j.name != 'table':
        continue
    print(j.name)
    pr = j.previous_elements
    for i in pr:
        if not has_name(i) or i.name in ('br', None):
            continue
        if i.name in ('table', 'hr', 'tr', 'td'):
            break
        print(i.name)
In [52]:
def has_name(i):
    """Tell whether `i` exposes a ``name`` attribute.

    The loop in this cell calls it to skip nodes that cannot be filtered
    by tag name.

    Args:
        i: any object, typically a node taken from the parsed document.

    Returns:
        True if reading ``i.name`` succeeds (even when the value is None),
        False if it raises AttributeError.
    """
    try:
        # Only the attribute lookup matters; the value itself is not used.
        i.name
    except AttributeError:
        return False
    return True
# For every <table> that follows the first <p>, print its name and then the
# names of the tags met walking backwards from it, stopping at a <table>,
# <tr>, <td> or <hr>. <br> tags and unnamed nodes are skipped.
e = soup.p.next_elements
for j in e:
    if not has_name(j) or j.name != 'table':
        continue
    print(j.name)
    pr = j.previous_elements
    for i in pr:
        if not has_name(i) or i.name in ('br', None):
            continue
        if i.name in ('table', 'hr', 'tr', 'td'):
            break
        print(i.name)
In [54]:
def has_name(i):
    """Tell whether `i` exposes a ``name`` attribute.

    The loop in this cell calls it to skip nodes that cannot be filtered
    by tag name.

    Args:
        i: any object, typically a node taken from the parsed document.

    Returns:
        True if reading ``i.name`` succeeds (even when the value is None),
        False if it raises AttributeError.
    """
    try:
        # Only the attribute lookup matters; the value itself is not used.
        i.name
    except AttributeError:
        return False
    return True
# Build `all`: one [names_before_table, 'table'] entry per <table> that
# follows the first <p>. names_before_table lists, in document order, the
# tag names met between the table and the previous <table>/<tr>/<td>/<hr>.
# NOTE(review): `all` shadows the Python builtin of the same name; it is kept
# because other cells read it.
all = []
e = soup.p.next_elements
for j in e:
    tabletup = []
    if not has_name(j) or j.name != 'table':
        continue
    print(j.name)
    beforetable = []
    pr = j.previous_elements
    for i in pr:
        if not has_name(i) or i.name in ('br', None):
            continue
        if i.name in ('table', 'hr', 'tr', 'td'):
            break
        print(i.name)
        beforetable.append(i.name)
    # Collected while walking backwards, so flip into document order.
    beforetable.reverse()
    tabletup.extend([beforetable, j.name])
    all.append(tabletup)
In [56]:
def has_name(i):
    """Tell whether `i` exposes a ``name`` attribute.

    The loop in this cell calls it to skip nodes that cannot be filtered
    by tag name.

    Args:
        i: any object, typically a node taken from the parsed document.

    Returns:
        True if reading ``i.name`` succeeds (even when the value is None),
        False if it raises AttributeError.
    """
    try:
        # Only the attribute lookup matters; the value itself is not used.
        i.name
    except AttributeError:
        return False
    return True
# Build `all`: one [elements_before_table, table] entry per <table> that
# follows the first <p>. elements_before_table holds, in document order, the
# elements met between the table and the previous <table>/<tr>/<td>/<hr>.
# NOTE(review): `all` shadows the Python builtin of the same name; it is kept
# because other cells read it.
all = []
e = soup.p.next_elements
for j in e:
    tabletup = []
    if not has_name(j) or j.name != 'table':
        continue
    print(j.name)
    beforetable = []
    pr = j.previous_elements
    for i in pr:
        if not has_name(i) or i.name in ('br', None):
            continue
        if i.name in ('table', 'hr', 'tr', 'td'):
            break
        print(i.name)
        beforetable.append(i)
    # Collected while walking backwards, so flip into document order.
    beforetable.reverse()
    tabletup.extend([beforetable, j])
    all.append(tabletup)
In [67]:
# Print the text of each <p> stored in front of the first collected table.
for line in all[0][0]:
    if line.name != 'p':
        continue
    print(line.get_text())
In [81]:
fname = "resources/outputfiles/V_8_1/1050PageMillRoadTable.html"
# Parse inside a `with` block so the file handle is closed as soon as the
# document has been read, instead of lingering until garbage collection.
# This rebinds `soup`, so every later cell works on this second document.
with open(fname, 'r') as html_file:
    soup = BeautifulSoup(html_file)
In [82]:
# Build `all`: one [elements_before_table, table] entry per <table> that
# follows the first <p>, printing the name of each element visited on the way.
# NOTE(review): `all` shadows the Python builtin of the same name.
# NOTE(review): has_name is not defined in this cell; it must already exist
# in the kernel from an earlier cell.
# NOTE(review): indentation was lost in this export, so it cannot be told
# from here whether `beforetable.append(i)` belongs to the `else:` branch
# only or runs for both branches — confirm against the original notebook.
all = []
e = soup.p.next_elements
for j in e:
tabletup = []
if not has_name(j):
continue
if j.name == 'table':
print j.name
beforetable = []
pr = j.previous_elements
for i in pr:
if not has_name(i):
continue
if i.name not in ('br', None):
# Reaching a table, one of its rows/cells, or an <hr> ends the backward walk.
if i.name in ('table', 'hr', 'tr', 'td'):
break
if i.parent.name == "p":
print i.name, "parent", i.parent.name
else:
print i.name
beforetable.append(i)
# Elements were gathered walking backwards; reverse into document order.
beforetable.reverse()
tabletup.append(beforetable)
tabletup.append(j)
if tabletup:
all.append(tabletup)
In [3]:
def _has_name(soup_obj):
"""checks if soup_obj is really a soup object or just a string
If it has a name it is a soup object"""
try:
name = soup_obj.name
return True
except AttributeError, e:
return False
# Build `linestables`: one [elements_before_table, table] entry per <table>
# that follows the first <p>, printing the name of each element visited.
# NOTE(review): indentation was lost in this export, so it cannot be told
# from here whether `beforetable.append(i)` belongs to the `else:` branch
# only or runs for both branches — confirm against the original notebook.
linestables = []
e = soup.p.next_elements
for j in e:
tabletup = []
if not _has_name(j):
continue
if j.name == 'table':
print j.name
beforetable = []
pr = j.previous_elements
for i in pr:
if not _has_name(i):
continue
if i.name not in ('br', None):
# Reaching a table, one of its rows/cells, or an <hr> ends the backward walk.
if i.name in ('table', 'hr', 'tr', 'td'):
break
if i.parent.name == "p":
print i.name, "parent", i.parent.name
else:
print i.name
beforetable.append(i)
# Elements were gathered walking backwards; reverse into document order.
beforetable.reverse()
tabletup.append(beforetable)
tabletup.append(j)
if tabletup:
linestables.append(tabletup)
In [5]:
# Print the text of each element stored in front of a table (block[0]),
# together with a '- Table -----' separator line.
# NOTE(review): indentation was lost in this export; the separator
# presumably prints once per block (outer-loop level) — confirm.
for block in linestables:
for line in block[0]:
print line.get_text()
print '- Table -----'
In [106]:
# For each block in `all`, keep the (index, text, table) triples whose
# preceding line mentions 'PEAK'; blocks without such a line yield [].
pk = (
    [(i, line.get_text(), block[-1])
     for line in block[0] if 'PEAK' in line.get_text()]
    for i, block in enumerate(all)
)
# Take the table (last item) out of the last matching triple of each block.
# The original stringified the whole triple, so the index and the line text
# ended up mixed into the markup handed to HTML().
imgs = [str(matches[-1][-1]) for matches in pk if matches]
# NOTE(review): index 1 assumes at least two blocks matched, and HTML must
# already be imported (from IPython.display) before this cell runs.
HTML(imgs[1])
Out[106]:
In [104]:
Out[104]:
In [69]:
# Pass 1: walk backwards from the first <table> and print the text of each
# named element met before an <hr>; <br> tags and unnamed nodes are skipped.
pr = soup.table.previous_elements
for i in pr:
    if i.name in ('br', None):
        continue
    if i.name == 'hr':
        break
    print(i.get_text())

# Pass 2: for every <table> after the first one, print its name, then the
# text of the elements met walking backwards until a <table>, <tr>, <td>
# or <hr>.
e = soup.table.next_elements
for j in e:
    if j.name != 'table':
        continue
    print(j.name)
    pr = j.previous_elements
    for i in pr:
        if i.name in ('br', None):
            continue
        if i.name in ('table', 'hr', 'tr', 'td'):
            break
        print(i.get_text())
In [30]:
# Pass 1: walk backwards from the first <table> until an <hr>, printing the
# text of <p> elements only.
pr = soup.table.previous_elements
for i in pr:
    if i.name in ('br', None):
        continue
    if i.name == 'hr':
        break
    if i.name == 'p':
        print(i.get_text())

# Pass 2: for every <table> after the first one, print its name, then the
# text of every named element met walking backwards until a <table>, <tr>,
# <td> or <hr>.
e = soup.table.next_elements
for j in e:
    if j.name != 'table':
        continue
    print(j.name)
    pr = j.previous_elements
    for i in pr:
        if i.name in ('br', None):
            continue
        if i.name in ('table', 'hr', 'tr', 'td'):
            break
        print(i.get_text())
In [31]:
# IPython `?` introspection: shows the help for this attribute. It is an
# exploration aid only and is not valid plain Python.
soup.table.previousSibling?
In [32]:
from IPython.display import HTML
In [33]:
# Dump the raw markup of the first <table> as plain text.
print(str(soup.table))
In [34]:
# Render the first <table> as rich HTML output.
HTML(str(soup.find('table')))
Out[34]:
In [35]:
# Advance `t` to the first <table> that follows the first one; `t` stays
# bound after the loop so a later cell can render it.
# The HTML(str(t)) call that used to sit inside the loop was dropped: its
# result was discarded, so it never displayed anything.
for t in soup.table.next_elements:
    if t.name == 'table':
        break
In [36]:
# Render the element currently bound to `t` as rich HTML output.
HTML(data=str(t))
Out[36]:
In [37]:
# Iterator over everything that follows the first <table> in the document.
ts = soup.find('table').next_elements
In [38]:
# Markup of every <table> produced by the `ts` iterator, in order.
l = []
for i in ts:
    if i.name == 'table':
        l.append(str(i))
In [39]:
# Render the last table markup collected in `l`.
HTML(data=l[-1])
In [ ]: