notebook.community

Edit and run



In [3]:

    
# Sample HTML fragment: plain text followed by an <a> element, then more plain
# text followed by a non-standard <ref> element.  Each element wraps a <p>
# holding a self-closing <br /> and <img />.
string = (
    'some text'
    '<a href="http://www.somesite.com/a/page">'
    ' <p> The red color. <br /> <img src="some/url/to/image" /> </p>'
    '</a>'
    'some final text'
    '<ref href="http://www.somesite.com/a/page">'
    ' <p> The blue color. <br /> <img src="some/url/to/image" /> </p>'
    '</ref>'
)



In [35]:

    
import re

# Paired element: "<name ...> ... </name>" (content may span lines).
# - "\b" after the captured name stops the engine from backtracking to a
#   shorter prefix of the tag name; without it an unclosed "<br>" is read as
#   "<b" + "r>" and paired with a later "</b>", deleting the text in between.
# - The explicit "<" before "/\1" requires a real closing tag instead of any
#   "/name>" that merely ends an attribute value.
TAG_REGEX = re.compile(r"<(\w+)\b.+?</\1\s*>", flags=re.DOTALL)

# Self-closing tag: a "<" followed, with no other "<" in between, by the
# nearest "/>".  ("[^<]" also matches newlines, so no DOTALL flag is needed.)
_SELF_CLOSING_TAG_REGEX = re.compile(r"<[^<]*?/>")


def remove_html_tags(text):
    """Strip HTML elements, including their content, from ``text``.

    Works in two passes:

    1. every self-closing tag (``<br />``, ``<img ... />``) is replaced by a
       single space;
    2. every paired element -- opening tag, everything inside it, and the
       matching closing tag -- is replaced by a single space.

    Text outside the removed spans is returned unchanged.  Tags that are
    neither self-closing nor paired with a closing tag of the same name
    (e.g. a bare ``<br>``) are left in place, as is a ``/>`` that has no
    opening ``<`` in front of it.  Matching is regex based and therefore not
    nesting aware: for nested elements of the same name the match ends at the
    first closing tag.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matched tags/elements replaced by spaces.
    """
    # Pass 1: drop self-closing tags first so they cannot interfere with the
    # paired-element match below.
    without_self_closing = _SELF_CLOSING_TAG_REGEX.sub(" ", text)

    # Pass 2: drop whole paired elements together with their content.
    return TAG_REGEX.sub(" ", without_self_closing)



In [36]:

    
# Run the cleaner over the sample HTML fragment and keep the cleaned text.
result = remove_html_tags(text=string)



In [37]:

    
# Function-call form of print: valid on both Python 2 and Python 3 (the bare
# `print result` statement is a SyntaxError on Python 3).
print(result)









    



some text some final text



In [ ]: