In [3]:
# Sample HTML fragment passed to remove_html_tags below: plain text mixed with
# two paired elements (<a>, <ref>), each wrapping a <p> that contains
# self-closing <br /> and <img /> tags.
string = (
    'some text'
    '<a href="http://www.somesite.com/a/page">'
    ' <p> The red color. <br /> <img src="some/url/to/image" /> </p>'
    '</a>'
    'some final text'
    '<ref href="http://www.somesite.com/a/page">'
    ' <p> The blue color. <br /> <img src="some/url/to/image" /> </p>'
    '</ref>'
)
In [35]:
import re
# Matches one self-closing tag such as ``<br />`` or ``<img src="..." />``:
# a "<", then the shortest run of characters that contains no further "<",
# then "/>".  Because the run may not cross another "<", a stray "/>" that has
# no opening "<" of its own is never matched.
SELF_CLOSING_TAG_REGEX = re.compile(r"<[^<]*?/>")

# Matches one paired element, opening tag through the first closing tag of the
# same name, content included (DOTALL lets the content span several lines).
#   * ``\b`` stops the captured name from being shortened by backtracking, so
#     ``<abc>`` can no longer be paired with ``</a>``.
#   * the closing side must be a real ``</name>``, not any "/name" followed by
#     ">" somewhere in the text.
# The match is lazy, so with nested elements of the same name it ends at the
# first closing tag; the outer closing tag is left in the output.
TAG_REGEX = re.compile(r"<(\w+)\b.*?</\1\s*>", flags=re.DOTALL)


def remove_html_tags(text):
    """Strip HTML elements from ``text``, replacing each with a single space.

    Works in two passes:

    1. every self-closing tag (``<... />``) is replaced by ``" "``;
    2. every paired element (``<name ...> ... </name>``) is replaced by
       ``" "`` -- the element's content is removed together with its tags.

    Text outside of any matched tag/element is returned unchanged, including
    a stray ``/>`` that does not close a tag and opening tags that have no
    matching closing tag.

    Args:
        text: the string to clean; may be empty.

    Returns:
        A new string with the matched markup replaced by spaces.
    """
    # Pass 1 must run first: once the self-closing tags are gone, the content
    # of paired elements no longer contains "/>" sequences.
    without_self_closing = SELF_CLOSING_TAG_REGEX.sub(" ", text)
    # Pass 2: drop each paired element in one non-overlapping left-to-right sweep.
    return TAG_REGEX.sub(" ", without_self_closing)
In [36]:
# Run the tag stripper over the sample fragment and keep the cleaned text.
result = remove_html_tags(text=string)
In [37]:
# Parenthesised form: prints the same text on Python 2 and is also valid on
# Python 3, where the bare ``print result`` statement is a SyntaxError.
print(result)
In [ ]: