This notebook is intended enable testing and evaluation of various extractor methods such as:

  • Heuristic Extractor
  • CRF Extractor
  • boilerpipe

In [1]:
import cPickle
import os.path

api_key = cPickle.load( file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'r' ) )

In [2]:
import cPickle
import os.path

cPickle.dump( api_key, file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) )

In [3]:
import sys
sys.path.append('../../foreign_modules/python/')

In [4]:
loc_key = 'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52'

In [5]:
import mediacloud, requests, csv, sys, os, json, cPickle

def get_download( downloads_id ):
    download = requests.get('https://api.mediacloud.org/api/v2/downloads/single/'+str(downloads_id)+'?key='+api_key)
    return download.json()[0]

def extract_story( preprocessed_lines, title, description, extractor_method ):
    extract_params = {'key':loc_key, 'preprocessed_lines':preprocessed_lines, 
                           'story_title':title, 'story_description':description, 'extractor_method': extractor_method}
    
    extract_result = requests.put('http://0:3000/api/v2/extractlines/extract',data=json.dumps(extract_params), 
                                headers = {'Content-type': 'application/json'})
    
    extract_result.raise_for_status()
    return extract_result.json()

def get_story_lines( raw_content ):
    story_lines_params = {'key':loc_key, 'body_html':raw_content }
    headers = {'Content-type': 'application/json'}
    story_lines = requests.put('http://0:3000/api/v2/extractlines/story_lines',data=json.dumps(story_lines_params), 
                               params={ 'key': loc_key },headers=headers)
    
    story_lines.raise_for_status()
    
    return story_lines

In [6]:
import subprocess
import tempfile
import codecs
import time
from lxml import html

#download = get_download( downloads_id )
#raw_content = download[u'raw_content']

def extract_with_boilerpipe( raw_content ):
    with tempfile.NamedTemporaryFile( suffix='.html', delete=False ) as t:
        #print t.name
    
        UTF8Writer = codecs.getwriter('utf8')
        t.file = UTF8Writer(t.file)
        t.file.write( raw_content )
    
        t.close()
        #time.sleep( 2 )
        print "original article tmp file ", t.name
        
        #input_file = '/tmp/416655019.htm'
        input_file = t.name
        
        output_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )
        
        output_file = output_tmp.name
        #output_file = '/tmp/highlighted.html'
        print output_file
        print subprocess.check_output(['java', '-jar',
                               '/home/dlarochelle/dev_scratch/boilerpipe_test/out/artifacts/boilerpipe_test_jar/boilerpipe_test.jar',
                               input_file, output_file ] )
        f = open( output_file, 'rb' )
        
        annotated_file_str = f.read()
        
        #t.unlink( t.name )
        output_tmp.close()
        #output_tmp.unlink( output_tmp.name )
        
    return annotated_file_str

In [7]:
import readability

def extract_with_python_readability( raw_content ):
    doc = readability.Document( raw_content )
    
    return [ doc.short_title(),
             doc.summary() ]

In [8]:
import goose

def extract_with_python_goose( raw_content ):
    g = goose.Goose()
    
    r = g.extract( raw_html=raw_content )
    return [r.title, r.cleaned_text ]

In [10]:
import justext

def extract_with_justext( raw_content ):
    ret = []
    
    paragraphs = justext.justext( raw_content, justext.get_stoplist('Portuguese') )
    
    #p = paragraphs[0]
    for p in paragraphs:
        if not p.is_boilerplate:
            ret.append(p.text)
            
    return ret

#extract_with_justext( raw_content )
#raw_html

#justext.get_stoplists()

In [11]:
import operator

def get_extractor_training_text( downloads_id, preprocessed_lines ):
    extractor_training_lines_result = requests.get(
                                                   'https://api.mediacloud.org/api/v2/extractlines/extractor_training_lines/' + str(downloads_id),
                                               headers = {'Content-type': 'application/json'}
                                               , params= {'key': api_key}
                                               )

    extractor_training_lines_result.raise_for_status()

    extractor_training_lines_result = extractor_training_lines_result.json()
    
    line_numbers = [ x['line_number'] for x in extractor_training_lines_result ]
    
    line_numbers = sorted(line_numbers)
    
    line_numbers.sort()
    
    #print line_numbers
    
    return operator.itemgetter( * line_numbers )( preprocessed_lines  )

import operator

def get_extracted_text( extractor_results ):
    included_line_numbers = extractor_results['included_line_numbers']
    #print included_line_numbers
    
    dl = extractor_results['download_lines']
   
    if len( included_line_numbers ) == 0:
        return []
    else:    
        return operator.itemgetter( * extractor_results['included_line_numbers']   )(dl)

In [12]:
def html_strip( str ):
    if str.isspace() or str == '':
        return u' '
    
    if str == '<':
        return u' '        
    
    try:
        return html.fromstring(str).text_content()    
    except:
        print "Unexpected error on string '" + str + "'" , sys.exc_info()[0]
        #raise
        return u''
    
    

def clean_for_comparison( str ):
    if len(str) > 0:
        ret = html_strip( str )
    else:
        return str
    
    if len(ret) > 0:
        ret = ret.strip()
    
    return ret

In [13]:
import Levenshtein

def lines_to_comparable_text( lines ):
    text = " ".join([ clean_for_comparison(line) for line in lines ])
    
    if text == '':
        text = u''
        
    return text

def compare_accuracy( lines, lines_expected ):
    return Levenshtein.distance( lines_to_comparable_text( lines ) , lines_to_comparable_text( lines_expected ) )

In [14]:
def get_anncestors( element ):
    anncestors = [ element ];
    anncestor = element.getparent()
    
    while anncestor != None :
        #print 'loop'
        anncestors.append( anncestor )
        anncestor = anncestor.getparent()
        
    return anncestors

In [15]:
def text_from_lxml_object( obj):
    if type(obj) is etree._ElementStringResult:
        return u'' + obj
    if type(obj) ==  etree._ElementUnicodeResult:
        return u'' + obj 
    else:
        try:
            return etree.tostring( obj , method='text', encoding="UTF-8") 
        except:
            print type(obj)
            print obj
            
            raise ''

In [16]:
from lxml import etree

downloads_id =  582817308 
download = get_download( downloads_id )
raw_content = download[ 'raw_content' ]
with open( '/tmp/' + str(downloads_id) , 'wb' ) as f:
    f.write( raw_content )

In [17]:
def text_children( element):
    ret =  [ t for t in element.xpath("//text()" ) if t.getparent() == element ]
    assert len( ret ) <= 2
    
    if len( ret ) == 2:
        assert ret[0].is_text
        assert ret[1].is_tail
    
    for r in ret:
        if r.is_text:
            assert element.text == r
        else:
            assert r.is_tail
            assert element.tail == r
            
    return ret

In [17]:
#start_container[ annotation['start_offset']: annotation['end_offset'] + 1 ]

In [49]:
def get_annotated_text( raw_content, annotation):

    htmlparser = etree.HTMLParser()
    root = etree.fromstring( raw_content, htmlparser )
    
    #print annotation[ 'start_xpath' ]
    
    start_container = get_element_from_xpath(root, annotation[ 'start_xpath' ] )
    end_container   = get_element_from_xpath( root, annotation[ 'end_xpath' ] )
    
    if ( start_container == end_container ):
        return [start_container[ annotation['start_offset']: annotation['end_offset'] + 1 ]]
    
        
    if start_container.getparent() == end_container.getparent():
        common_parent = start_container.getparent()
        assert start_container.is_text
        assert end_container.is_tail
        assert common_parent.text == start_container
        assert common_parent.tail == end_container
        
        return [start_container[ annotation['start_offset']:], end_container[ : annotation['end_offset'] + 1 ]]

    
    start_anncestors = get_anncestors( start_container )
    end_anncestors   = get_anncestors( end_container )
    
    print 'sc', start_container
    print 'ec', end_container
    print 'common'
    
    middle_contents = []
    
    p = start_container.getparent()
    prev_p = start_container
    
    if start_container.is_text:
        # append the tail
        texts = text_children( start_container.getparent() )
        if len( texts ) == 2:
            assert texts[1].is_tail
            middle_contents.append( texts[1] )
                                   
    
    while p not in end_anncestors:
        #print "parent:", p, "\n", etree.tostring( p )
        assert p in start_anncestors
        if prev_p != start_container:
            child_index = p.index( prev_p )
            if (child_index + 1) < len( list( p )):
                el = list(p)[ child_index + 1]
            else:
                el = None
                #print "stripping previous parent is the last child of curparrent"
        else:
            el = None
    
        while (el not in end_anncestors) and (el != None) :
            #print "inner loop"
            #print el
            #print etree.tostring( el )
            middle_contents.append( el )
            print el
            el = el.getnext()
            
        print "end inner loop"
        prev_p = p
        p = p.getparent()
    
    print "end loop"
    
    print p
    commonanncestors = list([ s for s in start_anncestors if s in end_anncestors ] )
    
    assert p in commonanncestors
    commonanncestor = commonanncestors[0]
    
    print commonanncestors
    print start_container == end_container
    assert p == commonanncestor
    
    print "commonacccestor", commonanncestor 
    print etree.tostring( commonanncestor )
       
    processed_children = [ c for c in list (commonanncestor) if c in start_anncestors ]
    print "ca's processed children"
    #print processed_children
    #print [ etree.tostring( c ) + "\n" for c in list( commonanncestor ) ] 
    assert( len( processed_children ) == 1 )
    processed_child = processed_children[ 0]
    
    print "processed_child", processed_child
    
    print etree.tostring( processed_child )
    
    el = processed_child.getnext()
    
    print "start True"
    
    assert el != None
    
    #print etree.tostring( el )
    
    while True:
        
            print 'outer loop:',  el, "\n", etree.tostring( el )



            while (el not in end_anncestors):
                print "inner loop"
                print el
                assert el != None
                print etree.tostring( el )
                middle_contents.append( el )
                el = el.getnext()
                assert el != None
                
            print 'end inner loop'
            print el
            print etree.tostring( el )
            # element is an ancester of end_container and has no (non-text)children
            # since end_container is text el must be it's parent so we can stop
            if (len(list(el)) == 0 ):
                            
                print 'found end_contain parent, exiting loop:',  el, "\n", etree.tostring( el )
                
                assert end_container in text_children(el)
    
                assert end_container.getparent() == el
                break
            
            ## HACK bc/ lxml/etree doesn't have a real text node
            ## treat the text as the first child node
            texts = text_children(el)
            
            assert ( all ( [t.getparent() == el for t in texts] ) )
            
            assert( len (texts ) <= 2 )
            if len( texts) > 0:
                if texts[0].is_text:
                    assert el.text == texts[0]
                    middle_contents.append(texts[0] );
    
            el = el[0]
        
            assert el != None
        
            
    #[ text_from_lxml_object( mc ) for mc in middle_contents ]
    print etree.tostring(el)
    #print etree.tostring(middle_contents[-1] )
    print end_container.is_tail
    print end_container.is_text
    print "escape while"
    #print list ( el.itertext() )
    #print 'ca'
    #commonanncestor.text
    #type(el)
    type( end_container )
    assert end_container in text_children(el)
    
    assert el == end_container.getparent()
    
    print etree.tostring( end_container.getparent() )
    #print middle_contents[-2:] 
    #print end_container
    text_children(el )
    etree.tostring( el.getparent() )
    #p_el = el.getparent()
    #print etree.tostring( p_el )
    #text_children( p_el )[0].is_text
    
    #annotation
    #list(p_el)
    if end_container.is_tail:
        assert len(text_children(el )) == 2
        
        middle_contents.append( text_children( el )[ 0] )
    
    else:
        assert end_container.is_text
        
    end_text = end_container[:annotation['end_offset'] - 1]
    
    print 'start container'
    print etree.tostring( start_container.getparent())
    print start_container
    print 'offset', annotation['start_offset']
    
    print 'end container'
    print etree.tostring( end_container.getparent())
    print end_container
    print 'offset', annotation['end_offset']
    
    #assert start_container.is_text
    
    start_text = start_container[annotation['start_offset']:]
    
    target_text = [ start_text ]
    target_text.extend( [ text_from_lxml_object( mc ) for mc in middle_contents ] )
    target_text.append( end_text )
    
    return target_text

In [50]:
import io

annotation = {u'end_offset': 67, u'end_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h1[1]/text()[1]', u'start_offset': 0, 
              u'start_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h1[1]/text()[1]'}

annotation = {u'end_offset': 142, u'end_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/div[1]/div[1]/div[1]/p[10]/text()[2]',
              u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/div[1]/div[1]/div[1]/p[1]/text()[1]'}
annotation = {u'end_offset': 1, u'end_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h5[1]/text()[3]',
              u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h5[1]/time[1]/text()[1]'}

downloads_id = 582815971
annotation = {u'end_offset': 123, u'end_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/p[1]/text()[1]', u'start_offset': 0,
              u'start_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/h1[1]/text()[1]'}

downloads_id = 413070223
annotation = {u'end_offset': 0, u'end_xpath': u'/html[1]/body[1]/div[2]/div[3]/div[1]/div[2]/div[2]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[3]/div[1]/div[2]/h1[1]/text()[1]'}
download = get_download( downloads_id )
raw_content = u'' + download[ 'raw_content' ]

#with io.open( '/tmp/' + str(downloads_id) + '.html' , 'w', encoding='utf8' ) as f:
#    f.write( raw_content )

get_annotated_text( u''+ raw_content, annotation )    
htmlparser = etree.HTMLParser()
root = etree.fromstring( raw_content, htmlparser )
d = root.xpath( annotation['start_xpath'] )[0]

p = d.getparent()


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-50-8482720fc38d> in <module>()
     21 #    f.write( raw_content )
     22 
---> 23 get_annotated_text( u''+ raw_content, annotation )
     24 htmlparser = etree.HTMLParser()
     25 root = etree.fromstring( raw_content, htmlparser )

<ipython-input-49-ef501dec1bba> in get_annotated_text(raw_content, annotation)
    115                 middle_contents.append( el )
    116                 el = el.getnext()
--> 117                 assert el != None
    118 
    119             print 'end inner loop'

AssertionError: 
sc القبض على 3 إخوان هاجموا مؤتمرًا لدعم الدستور بالحوامدية 
ec <Element div at 0x7ff5a88bcdc0>
common
end inner loop
end loop
<Element div at 0x7ff5a187a460>
[<Element div at 0x7ff5a187a460>, <Element div at 0x7ff5a187adc0>, <Element div at 0x7ff5a187a140>, <Element div at 0x7ff5a187ac80>, <Element body at 0x7ff5a187a5f0>, <Element html at 0x7ff5a88bce10>]
False
commonacccestor <Element div at 0x7ff5a187a460>
<div class="main_content_ip">&#13;
            <h1 class="article_title">&#1575;&#1604;&#1602;&#1576;&#1590; &#1593;&#1604;&#1609; 3 &#1573;&#1582;&#1608;&#1575;&#1606; &#1607;&#1575;&#1580;&#1605;&#1608;&#1575; &#1605;&#1572;&#1578;&#1605;&#1585;&#1611;&#1575; &#1604;&#1583;&#1593;&#1605; &#1575;&#1604;&#1583;&#1587;&#1578;&#1608;&#1585; &#1576;&#1575;&#1604;&#1581;&#1608;&#1575;&#1605;&#1583;&#1610;&#1577; </h1>&#13;
            <span class="SubTitle"/>&#13;
            <div align="info">&#13;
                    <span class="atr_1">&#1603;&#1578;&#1576; : &#1605;&#1581;&#1605;&#1608;&#1583; &#1575;&#1604;&#1580;&#1575;&#1585;&#1581;&#1609; &#1608;&#1580;&#1610;&#1607;&#1575;&#1606; &#1593;&#1576;&#1583; &#1575;&#1604;&#1593;&#1586;&#1610;&#1586;</span>&#13;
                <span class="atr_2">&#1605;&#1606;&#1584; 22 &#1583;&#1602;&#1610;&#1602;&#1577;</span>&#13;
            </div>&#13;
        <div class="social">&#13;
           <a href="javascript:window.print()" class="print_btn"> &#1591;&#1576;&#1575;&#1593;&#1577;</a>&#13;
         &#13;
            <!-- AddThis Button BEGIN -->&#13;
              <div>&#13;
                <ul class="social_btns">&#13;
                <li><a class="addthis_button_tweet"/></li>&#13;
                <li><a class="addthis_button_google_plusone" g:plusone:size="medium"/></li>&#13;
                <li><a class="addthis_counter addthis_pill_style"/></li>&#13;
                <li><a class="addthis_button_facebook_like" fb:like:layout="button_count"/></li>&#13;
               </ul>&#13;
                </div>               &#13;
                <script type="text/javascript" src="http://s7.addthis.com/js/250/addthis_widget.js#pubid=ra-4fe0971970cac661"/> &#13;
            <!-- AddThis Button END -->            &#13;
            &#13;
              &#13;
        </div>&#13;
        <div class=" main_focus">&#13;
            <!--img src="images/2.jpg"/  or -->&#13;
    <img src="http://media.elwatannews.com/News/Large/95453_660_212.jpg" alt="&#1575;&#1604;&#1604;&#1608;&#1575;&#1569; &#1605;&#1581;&#1605;&#1608;&#1583; &#1601;&#1575;&#1585;&#1608;&#1602;"/>&#13;
    <span class="image_caption">&#1575;&#1604;&#1604;&#1608;&#1575;&#1569; &#1605;&#1581;&#1605;&#1608;&#1583; &#1601;&#1575;&#1585;&#1608;&#1602;</span>&#13;
&#13;
&#13;
        </div>&#13;
        <p>&#1571;&#1604;&#1602;&#1578; &#1602;&#1608;&#1575;&#1578; &#1575;&#1604;&#1571;&#1605;&#1606; &#1576;&#1575;&#1604;&#1580;&#1610;&#1586;&#1577;&#1548; &#1575;&#1604;&#1602;&#1576;&#1590; &#1593;&#1604;&#1609; 3 &#1605;&#1606; &#1593;&#1606;&#1575;&#1589;&#1585; &#1580;&#1605;&#1575;&#1593;&#1577; &#1575;&#1604;&#1573;&#1582;&#1608;&#1575;&#1606; &#1575;&#1604;&#1573;&#1585;&#1607;&#1575;&#1576;&#1610;&#1577; &#1576;&#1575;&#1604;&#1581;&#1608;&#1575;&#1605;&#1583;&#1610;&#1577;&#1548;  &#1603;&#1575;&#1606;&#1608;&#1575;&#1605;&#1588;&#1575;&#1585;&#1603;&#1610;&#1606; &#1601;&#1609; &#1605;&#1587;&#1610;&#1585;&#1577; &#1607;&#1575;&#1580;&#1605;&#1578; &#1605;&#1572;&#1578;&#1605;&#1585;&#1611;&#1575; &#1571;&#1602;&#1575;&#1605;&#1607; &#1575;&#1604;&#1571;&#1607;&#1575;&#1604;&#1609; &#1604;&#1583;&#1593;&#1605; &#1575;&#1604;&#1583;&#1587;&#1578;&#1608;&#1585;. &#1571;&#1601;&#1575;&#1583;&#1578; &#1575;&#1604;&#1578;&#1581;&#1585;&#1610;&#1575;&#1578; &#1575;&#1604;&#1578;&#1609; &#1571;&#1588;&#1585;&#1601; &#1593;&#1604;&#1610;&#1607;&#1575; &#1575;&#1604;&#1604;&#1608;&#1575;&#1569; &#1605;&#1581;&#1605;&#1608;&#1583; &#1601;&#1575;&#1585;&#1608;&#1602; &#1605;&#1583;&#1610;&#1585; &#1575;&#1604;&#1573;&#1583;&#1575;&#1585;&#1577; &#1575;&#1604;&#1593;&#1575;&#1605;&#1577; &#1604;&#1604;&#1605;&#1576;&#1575;&#1581;&#1579; &#1571;&#1606; &#1571;&#1607;&#1575;&#1604;&#1609; &#1575;&#1604;&#1581;&#1608;&#1575;&#1605;&#1583;&#1610;&#1577; &#1571;&#1602;&#1575;&#1605;&#1608;&#1575; &#1605;&#1572;&#1578;&#1605;&#1585;&#1611;&#1575; &#1604;&#1583;&#1593;&#1605; &#1575;&#1604;&#1583;&#1587;&#1578;&#1608;&#1585; &#1576;&#1575;&#1604;&#1602;&#1585;&#1576; &#1605;&#1606; &#1605;&#1589;&#1606;&#1593; &#1575;&#1604;&#1587;&#1603;&#1585; &#1576;&#1575;&#1604;&#1581;&#1608;&#1575;&#1605;&#1583;&#1610;&#1577;&#1548; &#1608;&#1601;&#1608;&#1580;&#1574; &#1575;&#1604;&#1581;&#1590;&#1608;&#1585; &#1576;&#1575;&#1604;&#1593;&#1588;&#1585;&#1575;&#1578; &#1605;&#1606; &#1571;&#1593;&#1590;&#1575;&#1569; &#1580;&#1605;&#1575;&#1593;&#1577; &#1575;&#1604;&#1573;&#1582;&#1608;&#1575;&#1606; &#1610;&#1607;&#1575;&#1580;&#1605;&#1608;&#1606; &#1575;&#1604;&#1605;&#1572;&#1578;&#1605;&#1585; &#1576;&#1575;&#1604;&#1591;&#1608;&#1576; &#1608;&#1575;&#1604;&#1586;&#1580;&#1575;&#1580;&#1575;&#1578; &#1575;&#1604;&#1601;&#1575;&#1585;&#1594;&#1577;&#1548; &#1608;&#1581;&#1575;&#1608;&#1604;&#1608;&#1575; &#1575;&#1604;&#1607;&#1585;&#1576;&#1548; &#1573;&#1604;&#1575; &#1571;&#1606; &#1602;&#1608;&#1575;&#1578; &#1575;&#1604;&#1588;&#1585;&#1591;&#1577; &#1578;&#1605;&#1603;&#1606;&#1578; &#1605;&#1606; &#1589;&#1583;&#1607;&#1605; &#1607;&#1580;&#1608;&#1605;&#1607;&#1605; &#1548; &#1608;&#1591;&#1575;&#1585;&#1583;&#1578;&#1607;&#1605; &#1608;&#1571;&#1604;&#1602;&#1578; &#1575;&#1604;&#1602;&#1576;&#1590; &#1593;&#1604;&#1609; 3 &#1605;&#1606;&#1607;&#1605;&#1548; &#1608;&#1578;&#1605; &#1578;&#1593;&#1586;&#1610;&#1586; &#1575;&#1604;&#1602;&#1608;&#1575;&#1578; &#1576;&#1578;&#1588;&#1603;&#1610;&#1604;&#1575;&#1578; &#1605;&#1606; &#1575;&#1604;&#1571;&#1605;&#1606; &#1575;&#1604;&#1605;&#1585;&#1603;&#1586;&#1609;&#1548; &#1608;&#1581;&#1615;&#1585;&#1585; &#1605;&#1581;&#1590;&#1585;&#1611;&#1575; &#1576;&#1575;&#1604;&#1608;&#1575;&#1602;&#1593;&#1577; &#1608;&#1571;&#1615;&#1581;&#1610;&#1604; &#1604;&#1604;&#1606;&#1610;&#1575;&#1576;&#1577; . </p>&#13;
       &#13;
        <div class="share">&#13;
          <!-- AddThis Button BEGIN -->&#13;
               <div>&#13;
                <ul class="social_btns">&#13;
                <li><a class="addthis_button_tweet"/></li>&#13;
                <li><a class="addthis_button_google_plusone" g:plusone:size="medium"/></li>&#13;
                <li><a class="addthis_counter addthis_pill_style"/></li>&#13;
                <li><a class="addthis_button_facebook_like" fb:like:layout="button_count"/></li>&#13;
                    </ul>&#13;
                </div>               &#13;
                <script type="text/javascript" src="http://s7.addthis.com/js/250/addthis_widget.js#pubid=ra-4fe0971970cac661"/> &#13;
            <!-- AddThis Button END -->          &#13;
            &#13;
            <!-- Social media div by Shazly -->&#13;
            &#13;
        </div>&#13;
               &#13;
                    <div class="module_a_3">&#13;
                    </div>&#13;
                <iframe src="http://files.elwatannews.com/ad/news/" frameborder="0" width="600" height="150" scrolling="no"/>&#13;
        <!--Comments -->&#13;
        <div class="module_g_1">&#13;
&#13;
<p class="module_title">&#1575;&#1604;&#1578;&#1593;&#1604;&#1610;&#1602;&#1575;&#1578;<a class="CommPolicy" target="_blank" href="/Comment/CommentsRules">&#1587;&#1610;&#1575;&#1587;&#1577; &#1575;&#1604;&#1578;&#1593;&#1604;&#1610;&#1602;&#1575;&#1578;</a></p>&#13;
<div class="CommentsTabs">&#13;
   &#13;
    <a id="btnElwatanComments" href="#" class="ComSite"/>&#13;
     <a id="btnFacebookComments" href="#" class="ComFBDim"/>&#13;
</div>&#13;
&#13;
<div id="divFacebookComments" style="display: none;">&#13;
    <comments href="http://www.elwatannews.com/news/details/393669" num_posts="5" width="591" style="margin-right:10px;"/>&#13;
</div>&#13;
&#13;
<div id="divElwatanComments">&#13;
&#13;
    <div class="CommentsNoList">&#1604;&#1575; &#1610;&#1608;&#1580;&#1583; &#1578;&#1593;&#1604;&#1610;&#1602;&#1575;&#1578;</div>&#13;
&#13;
    <div class="form_holder">&#13;
<script type="text/javascript">&#13;
    function isValidEmailAddress(emailAddress) {&#13;
        var pattern = new RegExp(/^[+a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}$/i);&#13;
        return pattern.test(emailAddress);&#13;
    }&#13;
&#13;
    function addCommentByAjax(ParenttypeIDVal, ParentItemIDVal) &#13;
    {&#13;
        var userName = $("#txtUserName").val();&#13;
        var userEmail = $("#txtUserEmail").val();&#13;
        var userCommentTitle = $("#txtCommentTitle").val();&#13;
        var userCommentBody = $("#txtCommentBody").val();&#13;
        if (!$.trim($("#txtUserName").val())) &#13;
            {&#13;
                $("#txtUserNameMessage").text("&#1605;&#1606; &#1601;&#1590;&#1604;&#1603; &#1575;&#1583;&#1582;&#1604; &#1575;&#1604;&#1575;&#1587;&#1605;");&#13;
                return false;&#13;
            }&#13;
        else &#13;
            {&#13;
                $("p#txtUserNameMessage").hide();&#13;
            }&#13;
&#13;
        if (!$.trim($("#txtUserEmail").val())) &#13;
            {&#13;
                $("#txtUserEmailMessage").text("&#1605;&#1606; &#1601;&#1590;&#1604;&#1603; &#1575;&#1583;&#1582;&#1604; &#1575;&#1604;&#1575;&#1610;&#1605;&#1610;&#1604;");&#13;
                return false;&#13;
            }&#13;
         else &#13;
            {&#13;
                if (isValidEmailAddress($("#txtUserEmail").val()) == false) &#13;
                {&#13;
                    $("#txtUserEmailMessage").text("&#1605;&#1606; &#1601;&#1590;&#1604;&#1603; &#1575;&#1583;&#1582;&#1604; &#1575;&#1610;&#1605;&#1610;&#1604; &#1589;&#1581;&#1610;&#1581;");&#13;
                    return false;&#13;
                }&#13;
                else &#13;
                {&#13;
                    $("p#txtUserEmailMessage").hide();&#13;
                }&#13;
            }&#13;
&#13;
        if (!$.trim($("#txtCommentBody").val())) &#13;
            {&#13;
                $("#txtCommentBodyMessage").text("&#1605;&#1606; &#1601;&#1590;&#1604;&#1603; &#1575;&#1583;&#1582;&#1604; &#1575;&#1604;&#1578;&#1593;&#1604;&#1610;&#1602;");&#13;
                return false;&#13;
            }&#13;
        else &#13;
            {&#13;
                $("p#txtCommentBodyMessage").hide();&#13;
            }&#13;
        &#13;
        //$('div#PostContainer').hide();&#13;
        &#13;
        $.post("/comment/addcomment", { "itemID": ParentItemIDVal, "itemType": ParenttypeIDVal, "userName": userName, "userEmail": userEmail, "commentBody": userCommentBody },&#13;
                    function (data) {&#13;
                        $("#txtUserName").val('');&#13;
                        $("#txtUserEmail").val('');&#13;
                        $("#txtCommentBody").val('');&#13;
                        $("#notifyUser").show();&#13;
                        setTimeout(function () {&#13;
                            $("#notifyUser").fadeOut('slow');&#13;
                        }, 2000);&#13;
                    });&#13;
      }&#13;
</script>&#13;
<form action="/news/details/393669" method="post">        <div class="Addcommenttit">&#1575;&#1590;&#1601; &#1578;&#1593;&#1604;&#1610;&#1602;</div>&#13;
    <div id="AddCommentContainer">&#13;
        <div id="PostContainer">&#13;
            <p>&#13;
                <input type="text" value="" name="txtUserName" id="txtUserName" placeholder="&#1575;&#1604;&#1573;&#1587;&#1605;*" required="required"/>&#13;
                </p><p id="txtUserNameMessage" class="alert"/>&#13;
            &#13;
            <p>&#13;
                <input type="email" value="" name="txtUserEmail" id="txtUserEmail" placeholder="&#1575;&#1604;&#1576;&#1585;&#1610;&#1583; &#1575;&#1604;&#1573;&#1604;&#1603;&#1578;&#1585;&#1608;&#1606;&#1610;*" required="required"/>&#13;
                </p><p id="txtUserEmailMessage" class="alert"/>&#13;
            &#13;
            &#13;
            <p>&#13;
                <textarea name="txtCommentBody" id="txtCommentBody" cols="" rows="" required="required" placeholder="&#1606;&#1589; &#1575;&#1604;&#1578;&#1593;&#1604;&#1610;&#1602;*"/>&#13;
                    </p><p id="txtCommentBodyMessage" class="alert"/>&#13;
            &#13;
            <div id="notifyUser" class="CommentsNoList" style="display: none">&#13;
                &#1578;&#1605; &#1573;&#1590;&#1575;&#1601;&#1577; &#1578;&#1593;&#1604;&#1610;&#1602;&#1603; &#1576;&#1606;&#1580;&#1575;&#1581; &#1608;&#1587;&#1610;&#1578;&#1605; &#1605;&#1585;&#1575;&#1580;&#1593;&#1578;&#1577; &#1576;&#1608;&#1575;&#1587;&#1591;&#1577; &#1573;&#1583;&#1575;&#1585;&#1577; &#1575;&#1604;&#1605;&#1608;&#1602;&#1593;&#13;
            </div>&#13;
            <div class="btns_holder">&#13;
                <input type="button" id="" class="btn_1" value="&#1571;&#1590;&#1601; &#1578;&#1593;&#1604;&#1610;&#1602;" onclick="addCommentByAjax(1,393669)"/>&#13;
            </div>&#13;
        </div>&#13;
    </div>&#13;
</form>    </div>&#13;
</div>&#13;
&#13;
<!--Facebook Initialization code has been moved to _Header View -->        </div>&#13;
    </div>&#13;
    
ca's processed children
processed_child <Element h1 at 0x7ff5a88bce60>
<h1 class="article_title">&#1575;&#1604;&#1602;&#1576;&#1590; &#1593;&#1604;&#1609; 3 &#1573;&#1582;&#1608;&#1575;&#1606; &#1607;&#1575;&#1580;&#1605;&#1608;&#1575; &#1605;&#1572;&#1578;&#1605;&#1585;&#1611;&#1575; &#1604;&#1583;&#1593;&#1605; &#1575;&#1604;&#1583;&#1587;&#1578;&#1608;&#1585; &#1576;&#1575;&#1604;&#1581;&#1608;&#1575;&#1605;&#1583;&#1610;&#1577; </h1>&#13;
            
start True
outer loop: <Element span at 0x7ff5a19e94b0> 
<span class="SubTitle"/>&#13;
            
inner loop
<Element span at 0x7ff5a19e94b0>
<span class="SubTitle"/>&#13;
            
inner loop
<Element div at 0x7ff5a19e9910>
<div align="info">&#13;
                    <span class="atr_1">&#1603;&#1578;&#1576; : &#1605;&#1581;&#1605;&#1608;&#1583; &#1575;&#1604;&#1580;&#1575;&#1585;&#1581;&#1609; &#1608;&#1580;&#1610;&#1607;&#1575;&#1606; &#1593;&#1576;&#1583; &#1575;&#1604;&#1593;&#1586;&#1610;&#1586;</span>&#13;
                <span class="atr_2">&#1605;&#1606;&#1584; 22 &#1583;&#1602;&#1610;&#1602;&#1577;</span>&#13;
            </div>&#13;
        
end inner loop
<Element div at 0x7ff5a88bcdc0>
<div class="social">&#13;
           <a href="javascript:window.print()" class="print_btn"> &#1591;&#1576;&#1575;&#1593;&#1577;</a>&#13;
         &#13;
            <!-- AddThis Button BEGIN -->&#13;
              <div>&#13;
                <ul class="social_btns">&#13;
                <li><a class="addthis_button_tweet"/></li>&#13;
                <li><a class="addthis_button_google_plusone" g:plusone:size="medium"/></li>&#13;
                <li><a class="addthis_counter addthis_pill_style"/></li>&#13;
                <li><a class="addthis_button_facebook_like" fb:like:layout="button_count"/></li>&#13;
               </ul>&#13;
                </div>               &#13;
                <script type="text/javascript" src="http://s7.addthis.com/js/250/addthis_widget.js#pubid=ra-4fe0971970cac661"/> &#13;
            <!-- AddThis Button END -->            &#13;
            &#13;
              &#13;
        </div>&#13;
        
outer loop: <Element a at 0x7ff5a19e9960> 
<a href="javascript:window.print()" class="print_btn"> &#1591;&#1576;&#1575;&#1593;&#1577;</a>&#13;
         &#13;
            
inner loop
<Element a at 0x7ff5a19e9960>
<a href="javascript:window.print()" class="print_btn"> &#1591;&#1576;&#1575;&#1593;&#1577;</a>&#13;
         &#13;
            
inner loop
<!-- AddThis Button BEGIN -->
<!-- AddThis Button BEGIN -->&#13;
              
inner loop
<Element div at 0x7ff5a19e9500>
<div>&#13;
                <ul class="social_btns">&#13;
                <li><a class="addthis_button_tweet"/></li>&#13;
                <li><a class="addthis_button_google_plusone" g:plusone:size="medium"/></li>&#13;
                <li><a class="addthis_counter addthis_pill_style"/></li>&#13;
                <li><a class="addthis_button_facebook_like" fb:like:layout="button_count"/></li>&#13;
               </ul>&#13;
                </div>               &#13;
                
inner loop
<Element script at 0x7ff5a19e9aa0>
<script type="text/javascript" src="http://s7.addthis.com/js/250/addthis_widget.js#pubid=ra-4fe0971970cac661"/> &#13;
            
inner loop
<!-- AddThis Button END -->
<!-- AddThis Button END -->            &#13;
            &#13;
              &#13;
        

In [20]:
import sqlite3

db = sqlite3.connect('extractor_train_dbs/dev_2014-11-03T09_06_11-0500.db')
db.row_factory = sqlite3.Row

cursor = db.cursor()

cursor.execute( "SELECT * from dlannotations where downloads_id =  582817308" )

row = cursor.fetchone()
print row
d = dict([ (k, row[k]) for k in row.keys() ])
d.keys()
d['annotations_json']
annotations = json.loads( d['annotations_json']  )
annotations
raw_content == d['raw_content']
#get_annotated_text( d['raw_content'], annotations[1] )


<sqlite3.Row object at 0x7ff5ba9d8eb0>
Out[20]:
False

In [21]:
#print 'end el'
#print el
#print etree.tostring( el )


#print start_anncestors
#print end_anncestors

#commonanncestors = list([ s for s in start_anncestors if s in end_anncestors ] )
#commonanncestor = commonanncestors[0]

#commonanncestor

#print commonanncestor
#print 'full dump'

#print etree.tostring( commonanncestor )
#parent = start_container.getparent()

#parent.getnext()

#parent.getnext().index( parent )
#list(parent.itertext() )
#list(parent)
#.index( start_container )

#root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/h1[1]/text()[1]'   )
#root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[1]/text()[1]' )
#res = root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[3]/text()[1]' )
#res = root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/span[2]/p[1]/text()[1]' )
#res = root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[15]/text()[2]' )
#len ( res[0] )
#res

In [21]:
import difflib
from IPython.display import HTML

from collections import Counter

def ratcliff_obershelp_compare( actual_lines, expected_lines ):
    
    words_expected = lines_to_comparable_text(expected_lines ).split()
    words_crf      = lines_to_comparable_text(actual_lines ).split()
    
    differ = difflib.Differ( )
    
    #print words_crf[:10]
    #print words_expected[:10]
    list( differ.compare( words_crf , words_expected ) )
    counts = Counter([ d[0] for d in differ.compare( words_expected, words_crf   ) ])
    
    tp = counts[' ']
    fp = counts['+']
    fn = counts['-']
    
    if float(tp+fp) == 0:
        precision = 0.0
    else:
        precision = tp/float(tp+fp)
        
    if float( tp + fn ) == 0:
        recall = 0
    else:
        recall    = tp/float( tp + fn )
    
    if ( precision + recall ) > 0:
        f1 = 2*(precision*recall)/( precision + recall )
    else:
        f1 = 0
    
    ret = { 'precision': precision,
        'recall': recall,
        'f1': f1
    }
    
    return ret

#ratcliff_obershelp_compare( words_crf, words_expected )

In [22]:
#downloads_id
#story
#raw_content
#expected_lines
#preprocessed_lines

def create_extractor_training_object( downloads_id, expected_lines=None ):
    download = get_download( downloads_id )
    
    raw_content = download[u'raw_content']
    stories_id = download[u'stories_id']
    
    print download['url']
    
    story = requests.get('https://api.mediacloud.org/api/v2/stories/single/'+str(stories_id)+'?key='+api_key)
    
    story = story.json()[0]
    
    story_lines = get_story_lines( raw_content )
    #print story_lines.content
    preprocessed_lines = story_lines.json()
    
    if not expected_lines:
        expected_lines = get_extractor_training_text( downloads_id, preprocessed_lines  )

    ret = { 'downloads_id': downloads_id,
           'raw_content': raw_content,
           'story': story,
           'preprocessed_lines': preprocessed_lines,
           'expected_lines': expected_lines
           }
    
    return ret

In [23]:
def compare_extractors_for_download( downloads_id ):
    
    eto = create_extractor_training_object( downloads_id )
    
    return comp_extractors( eto )

    download = get_download( downloads_id )
    
    raw_content = download[u'raw_content']
    stories_id = download[u'stories_id']
     
    story = requests.get('https://api.mediacloud.org/api/v2/stories/single/'+str(stories_id)+'?key='+api_key)
    
    story = story.json()[0]
    
    story_lines = get_story_lines( raw_content )
    #print story_lines.content
    preprocessed_lines = story_lines.json()
    
    expected_lines = get_extractor_training_text( downloads_id, preprocessed_lines  )
    
    
    
def comp_extractors( eto ):    
    downloads_id = eto['downloads_id']
    story = eto['story']
    raw_content = eto['raw_content']
    preprocessed_lines = eto['preprocessed_lines']
    expected_lines = eto['expected_lines']
        
    title = story[u'title']
    description = story[u'description']
    url = story[u'url']
    
    heur_extract = extract_story( preprocessed_lines, title, description, 'HeuristicExtractor')
    crf_extract = extract_story( preprocessed_lines, title, description, 'CrfExtractor')
    
    heur_lines = get_extracted_text( heur_extract )
    crf_lines  = get_extracted_text( crf_extract )
    
    python_readability_lines = extract_with_python_readability( raw_content )
    
    py_goose_lines = extract_with_python_goose( raw_content )
    justext_lines  = extract_with_justext( raw_content )
    global glob_expected_lines
    global glob_crf_lines
    glob_expected_lines = expected_lines
    glob_crf_lines      = crf_lines
    
    #tree = html.fromstring( extract_with_boilerpipe( raw_content) )   
    #spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
    #boiler_pipe_lines = [ s.text for s in spans ]

    #print "expected_lines:"
    #print lines_to_comparable_text(expected_lines)
    
    #print "boilerpipe lines"
    #print lines_to_comparable_text(boiler_pipe_lines)
    
    comp_results = {}
    comp_results['heur'] = ratcliff_obershelp_compare( heur_lines, expected_lines )
    comp_results['crf']  = ratcliff_obershelp_compare( crf_lines, expected_lines )
    #comp_results['boiler_pipe'] = ratcliff_obershelp_compare( boiler_pipe_lines, expected_lines )
    comp_results['python_readibilty'] = ratcliff_obershelp_compare( python_readability_lines, expected_lines )
    comp_results['py_goose'] = ratcliff_obershelp_compare( py_goose_lines, expected_lines )
    comp_results['justext']  = ratcliff_obershelp_compare( justext_lines, expected_lines )
    
    comp_results['downloads_id'] = downloads_id
    
    #comp_results['expected']    = compare_accuracy( expected_lines, expected_lines )
    return comp_results

In [25]:
#comps_expected = comps

In [71]:
downloads_id = 416655019
downloads_ids = [391881020,401370599,412896439,412952145,412977048,413024519,413657081,413835576,414040102,414257623,
                 414377428,414480464,414818749,414983458,415185946,415186582,415197547,415424551,415978069,416026460,
                 416026587,416047494,416047513,416210404,416263840,416306952,416426245,416655019,416730837,416802690,
                 417347290,417347524,417368539,417389613,417477837,417653177,418489742,418544762,418574641,418648698,
                 418661859,419404469,419440474,419483895,419873979,420430754,420599387,420666122,421520860,421834553,
                 422181106,422280595,422910963,423318170,424080271,424369085,424796346,424840366,425206279,426405203,
                 426560018,426632784,426709900,428449440,429607289,430363249,430995428,433457459,435624796,435659593,461175103,461175549,461176415,461176844,461177487,461178557,461178590,461179203,461179222,461179441,461179762,461179818,461179954,461179956,461180307,461181039,461181597,461186137,461186258,461186833,461187188,461187261,461187577,461188549,461189069,461190586,461193383]

print len( downloads_ids )

comps = []

extractor_training_objects = []
for downloads_id in downloads_ids[:10]:
    print 'downloads_id:', downloads_id
    extractor_training_objects.append( create_extractor_training_object( downloads_id ) )


97
downloads_id: 391881020
[30, 31]
downloads_id: 401370599
[202, 205, 212, 213]
downloads_id: 412896439
[265, 266, 267, 279, 281, 283, 285, 287, 289, 291, 293, 303, 305, 307, 309, 311, 313, 315, 702]
downloads_id: 412952145
[415, 422, 436, 440, 444, 448, 449, 450, 451, 455, 459]
downloads_id: 412977048
[471, 474, 505, 507]
downloads_id: 413024519
[606, 607, 608, 692, 694, 695, 696, 697]
downloads_id: 413657081
[597, 608, 613, 616, 624, 627, 630, 640, 644, 647, 650, 653, 656, 659, 662, 665]
downloads_id: 413835576
[283, 284, 304, 305, 306]
downloads_id: 414040102
[3270, 3273, 3377, 3439, 3444, 3448, 3450, 3452, 3454, 3456, 3458, 3460, 3462, 3464, 3465, 3467]
downloads_id: 414257623
[566, 569, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588]

In [128]:
for extractor_training_object in extractor_training_objects:
    res = comp_extractors( extractor_training_object )
    #print res
    comps.append( res )
    

#print 'Comps_expteced', comps_expected
#print 'comps', comps
#comps == comps_expected


[513, 606, 607, 608, 609, 616, 617, 618, 619, 620, 621, 622, 629, 630, 631, 632]
[577]
original article tmp file  /tmp/tmpMOST9p.html
/tmp/tmp49WYKv.html
/tmp/tmpMOST9p.html
/tmp/tmpMOST9p.html
Now open file:///tmp/tmp49WYKv.html in your web browser

[501, 610, 611, 613, 614, 615]
[563, 610, 611, 613, 614, 615, 617, 618]
original article tmp file  /tmp/tmpEB9b7u.html
/tmp/tmpckfQ5c.html
/tmp/tmpEB9b7u.html
/tmp/tmpEB9b7u.html
Now open file:///tmp/tmpckfQ5c.html in your web browser

[302, 324, 328, 332, 336, 344, 348, 352, 356, 410, 469]
[302, 321, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360]
original article tmp file  /tmp/tmp5Vok6c.html
/tmp/tmprzaVe_.html
/tmp/tmp5Vok6c.html
/tmp/tmp5Vok6c.html
Now open file:///tmp/tmprzaVe_.html in your web browser

[1166, 1264]
[418, 1161, 1166, 1264]
original article tmp file  /tmp/tmpcvIiS_.html
/tmp/tmpeiI_ab.html
/tmp/tmpcvIiS_.html
/tmp/tmpcvIiS_.html
Now open file:///tmp/tmpeiI_ab.html in your web browser

[699, 733, 734, 735, 736, 737, 738, 821, 907, 1019, 1034, 1049, 1064, 1137, 1143, 1149, 1155, 1161, 1167, 1173, 1179, 1185, 1191]
[733, 734, 735, 736, 737, 738]
original article tmp file  /tmp/tmp1sLuyO.html
/tmp/tmpAMCIJ8.html
/tmp/tmp1sLuyO.html
/tmp/tmp1sLuyO.html
Now open file:///tmp/tmpAMCIJ8.html in your web browser

[819, 867, 868, 869, 870, 871, 1483, 1485, 1495, 1507, 1519, 1531, 1543, 1555, 1557, 1567, 1579, 1591, 1606, 1618, 1630, 1650, 1652, 1662, 1674, 1686, 1700, 1712, 1724, 1726, 1739, 1741, 1751, 1753, 1765, 1777, 1779, 1789, 1801, 1813, 1825, 1848, 1860, 1872, 1884, 1904, 1916, 1928, 1930, 1940, 1952, 1964, 1976, 1978, 1988, 1990, 2000, 2012, 2024, 2026, 2036, 2048, 2060, 2072, 2084, 2096, 2108, 2120, 2143, 2145, 2163, 2179, 2192, 2194, 2208, 2224, 2236, 2238, 2248, 2250, 2264, 2278, 2290, 2292, 2302, 2304, 2314, 2335, 2347, 2359, 2371, 2373, 2383, 2395, 2409, 2421, 2433, 2445, 2447, 2457, 2459, 2469, 2481, 2493, 2505, 2517, 2519, 2529, 2541, 2553, 2565, 2577, 2589, 2601, 2613, 2625, 2637, 2639, 2649, 2661, 2673, 2685, 2697, 2709, 2729, 2741, 2743, 2753, 2765, 2777, 2779, 2789, 2801, 2822, 2834, 2846, 2858, 2870, 2882, 2894, 2906, 2918, 2930, 2932, 2942, 2954, 2966, 2986, 2998, 3010, 3022, 3034, 3036, 3046, 3058, 3070, 3082, 3094, 3106, 3108, 3118, 3130, 3142, 3154, 3166, 3212, 3214, 3224, 3226, 3236, 3248, 3260, 3272, 3284, 3286, 3296, 3308, 3310, 3320, 3322, 3332, 3344, 3356, 3358, 3368, 3380, 3392, 3394, 3404, 3406, 3416, 3436, 3448, 3460, 3462, 3472, 3474, 3484, 3496, 3508, 3520, 3522, 3532, 3544, 3546, 3556, 3568, 3580, 3592, 3604, 3616, 3618, 3628, 3640, 3652, 3664, 3676, 3688, 3700, 3712, 3714, 3724, 3726, 3736, 3748, 3760, 3772, 3784, 3786, 3796, 3808, 3810, 3820, 3832, 3844, 3856, 3868, 3870, 3880, 3892, 3904, 3906, 3916, 3928, 3930, 3940, 3942, 3952, 3964, 3976, 3978, 3988, 4000, 4012, 4024, 4036, 4048, 4060, 4072, 4074, 4084, 4096, 4108, 4120, 4132, 4144, 4146, 4156, 4158, 4168, 4170, 4180, 4192, 4204, 4216, 4228, 4240, 4252, 4254, 4264, 4266, 4276, 4288, 4300, 4302, 4312, 4324, 4336, 4338, 4348, 4350, 4360, 4372, 4384, 4386, 4396, 4408, 4420, 4432, 4444, 4456, 4468, 4480, 4492, 4504, 4516, 4528, 4530, 4540, 4552, 4573, 4585, 4597, 4609, 4621, 4633, 4635, 4645, 4657, 4669, 4681, 4683, 4693, 4695, 4705, 4717, 4729, 4741, 4743, 4753, 4755, 4765, 4777, 4779, 4789, 4791, 4801, 4813, 4815, 4825, 4837, 4849, 4861, 4873, 4885, 4897, 4909, 4921, 4933, 4945, 4957, 4969, 4981, 5213, 5219, 5225, 5231, 5237, 5243, 5249, 5255, 5261, 5267, 5273, 5279, 5285, 5291, 5297, 5303, 5309, 5315, 5321, 5327, 5333, 5339, 5345, 5351, 5357, 5363, 5369, 5375, 5381, 5505, 5554, 5773, 5774]
[648, 654, 660, 666, 672, 677, 682, 687, 689, 694, 698, 704, 710, 716, 722, 728, 734, 740, 746, 752, 758, 764, 770, 776, 782, 788, 794, 819, 820, 822, 823, 863, 866, 867, 868, 869, 870, 871, 2421, 2423, 2433, 2435, 2445, 2447, 2457, 2459, 2469, 2471, 2481, 2483, 2493, 2505, 2507, 2517, 2519, 2529, 2531, 2541, 2553, 2555, 2565, 2567, 2577, 2579, 2589, 2591, 2601, 2603, 2613, 2615, 2625, 2627, 2637, 2639, 2649, 2651, 2661, 2663, 2673, 2675, 2685, 2697, 2709, 2711, 2717, 2729, 2731, 2741, 2743, 2753, 2765, 2767, 2777, 2779, 2789, 2791, 2801, 2809, 2822, 2834, 2836, 2846, 2848, 2858, 2860, 2870, 2882, 2884, 2894, 2906, 2908, 2918, 2920, 2930, 2932, 2942, 2954, 2966, 2968, 2974, 2986, 2988, 2998, 3000, 3010, 3012, 3022, 3024, 3034, 3036, 3046, 3058, 3060, 3070, 3072, 3082, 3084, 3094, 3106, 3108, 3118, 3120, 3130, 3142, 3144, 3154, 3156, 3166, 3168, 3204, 3212, 3214, 3224, 3226, 3236, 3248, 3260, 3272, 3284, 3286, 3296, 3298, 3308, 3310, 3320, 3322, 3332, 3344, 3356, 3358, 3368, 3380, 3382, 3392, 3394, 3404, 3406, 3416, 3424, 3436, 3438, 3448, 3450, 3460, 3462, 3472, 3474, 3484, 3486, 3496, 3498, 3508, 3520, 3522, 3532, 3534, 3544, 3546, 3556, 3568, 3570, 3580, 3592, 3594, 3604, 3616, 3618, 3628, 3640, 3642, 3652, 3654, 3664, 3666, 3676, 3688, 3700, 3702, 3712, 3714, 3724, 3726, 3736, 3738, 3748, 3750, 3760, 3772, 3774, 3784, 3786, 3796, 3808, 3810, 3820, 3832, 3844, 3856, 3858, 3868, 3870, 3880, 3892, 3904, 3906, 3916, 3928, 3930, 3940, 3942, 3952, 3954, 3964, 3966, 3976, 3978, 3988, 3990, 4000, 4002, 4012, 4014, 4024, 4026, 4036, 4048, 4060, 4062, 4072, 4074, 4084, 4096, 4098, 4108, 4120, 4132, 4134, 4144, 4146, 4156, 4158, 4168, 4170, 4180, 4192, 4194, 4204, 4206, 4216, 4218, 4228, 4240, 4242, 4252, 4254, 4264, 4266, 4276, 4278, 4288, 4290, 4300, 4302, 4312, 4314, 4324, 4336, 4338, 4348, 4350, 4360, 4372, 4374, 4384, 4386, 4396, 4408, 4410, 4420, 4422, 4432, 4444, 4456, 4458, 4468, 4470, 4480, 4492, 4504, 4516, 4528, 4530, 4540, 4542, 4552, 4554, 4560, 4573, 4575, 4585, 4587, 4597, 4609, 4621, 4633, 4635, 4645, 4647, 4657, 4659, 4669, 4671, 4681, 4683, 4693, 4695, 4705, 4707, 4717, 4719, 4729, 4741, 4743, 4753, 4755, 4765, 4777, 4779, 4789, 4791, 4801, 4803, 4813, 4815, 4825, 4827, 4837, 4839, 4849, 4851, 4861, 4863, 4873, 4885, 4887, 4897, 4899, 4909, 4921, 4923, 4933, 4935, 4945, 4947, 4957, 4969, 4981, 5107, 5113, 5119, 5125, 5131, 5137, 5143, 5149, 5155, 5161, 5167, 5173, 5179, 5185, 5191, 5197, 5203, 5213, 5219, 5225, 5231, 5237, 5243, 5249, 5255, 5261, 5267, 5273, 5279, 5285, 5291, 5297, 5303, 5309, 5315, 5321, 5327, 5333, 5339, 5345, 5351, 5357, 5363, 5369, 5375, 5381]
original article tmp file  /tmp/tmpCcsGl_.html
/tmp/tmpEuSCZ6.html
/tmp/tmpCcsGl_.html
/tmp/tmpCcsGl_.html
Now open file:///tmp/tmpEuSCZ6.html in your web browser

[322, 326, 328, 330, 332, 443]
[]
original article tmp file  /tmp/tmprfROWD.html
/tmp/tmpPqfOyB.html
/tmp/tmprfROWD.html
/tmp/tmprfROWD.html
Now open file:///tmp/tmpPqfOyB.html in your web browser

[]
[0]
original article tmp file  /tmp/tmpcfOHXO.html
/tmp/tmpv7ImTX.html
/tmp/tmpcfOHXO.html
/tmp/tmpcfOHXO.html
Now open file:///tmp/tmpv7ImTX.html in your web browser

Unexpected error on string '"' <type 'exceptions.KeyboardInterrupt'>
[131, 159]
[131, 159]
original article tmp file  /tmp/tmp5Nqg0m.html
/tmp/tmplrPGiy.html
/tmp/tmp5Nqg0m.html
/tmp/tmp5Nqg0m.html
Now open file:///tmp/tmplrPGiy.html in your web browser

[635, 636]
[]
original article tmp file  /tmp/tmp5mZWDV.html
/tmp/tmpWbdf3o.html
/tmp/tmp5mZWDV.html
/tmp/tmp5mZWDV.html
Now open file:///tmp/tmpWbdf3o.html in your web browser

[229, 254, 255, 256, 264, 265, 266, 268, 270, 272, 274, 275, 276, 278, 280, 284, 286, 289]
[229, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287]
original article tmp file  /tmp/tmpfqIYs5.html
/tmp/tmpj5VCGy.html
/tmp/tmpfqIYs5.html
/tmp/tmpfqIYs5.html
Now open file:///tmp/tmpj5VCGy.html in your web browser

[1166, 1264]
[418, 1161, 1166, 1264]
original article tmp file  /tmp/tmpieNCNR.html
/tmp/tmpCTnUwj.html
/tmp/tmpieNCNR.html
/tmp/tmpieNCNR.html
Now open file:///tmp/tmpCTnUwj.html in your web browser

[1169, 1267]
[418, 1161, 1165, 1167, 1169, 1267]
original article tmp file  /tmp/tmpbrYgrc.html
/tmp/tmpltytab.html
/tmp/tmpbrYgrc.html
/tmp/tmpbrYgrc.html
Now open file:///tmp/tmpltytab.html in your web browser

[1166, 1264]
[418, 1161, 1166, 1264]
original article tmp file  /tmp/tmphrQUdE.html
/tmp/tmpARont6.html
/tmp/tmphrQUdE.html
/tmp/tmphrQUdE.html
Now open file:///tmp/tmpARont6.html in your web browser

[135, 211, 217, 360, 402, 408, 446, 456, 466, 489, 503, 547, 552, 586, 591, 616, 621, 636, 641, 646, 651, 681, 686, 701, 706, 711, 716, 735, 749, 763, 768, 773, 797, 802, 807, 812, 846, 851, 856, 861, 890, 920, 935, 944, 974, 989, 998, 1008, 1022, 1032, 1074, 1079, 1084, 1093, 1103, 1113, 1143, 1158, 1173, 1183, 1226, 1236, 1246, 1251, 1261, 1274, 1279, 1284, 1289, 1294, 1299, 1304, 1309, 1314, 1319, 1324, 1331, 1336, 1341, 1346, 1351, 1356, 1361, 1366, 1371, 1376, 1381, 1423, 1424, 1426, 1427, 1428, 1429, 1430, 1431, 1459, 1467, 1474, 1488, 1502, 1506, 1510, 1514, 1590, 1595, 1600, 1605]
[1424, 1426, 1427, 1428, 1429, 1430, 1431]
original article tmp file  /tmp/tmpQWsTGY.html
/tmp/tmpEBCwDl.html
/tmp/tmpQWsTGY.html
/tmp/tmpQWsTGY.html
Now open file:///tmp/tmpEBCwDl.html in your web browser

[228, 254, 257, 258, 259, 260, 261, 262, 264]
[228, 229, 232, 234, 250, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262]
original article tmp file  /tmp/tmpnBJvzK.html
/tmp/tmpfPHdSa.html
/tmp/tmpnBJvzK.html
/tmp/tmpnBJvzK.html
Now open file:///tmp/tmpfPHdSa.html in your web browser

[254, 255, 256, 257, 258, 259, 260]
[229, 230, 233, 235, 254, 255, 256, 257, 258, 259, 260]
original article tmp file  /tmp/tmpRAOWig.html
/tmp/tmpLxrda5.html
/tmp/tmpRAOWig.html
/tmp/tmpRAOWig.html
Now open file:///tmp/tmpLxrda5.html in your web browser

[96, 118]
[96, 153, 156]
original article tmp file  /tmp/tmpk4e5yo.html
/tmp/tmp7wxXDR.html
/tmp/tmpk4e5yo.html
/tmp/tmpk4e5yo.html
Now open file:///tmp/tmp7wxXDR.html in your web browser

[72, 125]
[72, 111, 122, 125]
original article tmp file  /tmp/tmpCfyTcg.html
/tmp/tmpV0lyur.html
/tmp/tmpCfyTcg.html
/tmp/tmpCfyTcg.html
Now open file:///tmp/tmpV0lyur.html in your web browser

[595, 598, 599, 600, 601]
[549, 595, 596, 597, 598, 599, 600, 601]
original article tmp file  /tmp/tmpZo9zyO.html
/tmp/tmpbg1Sms.html
/tmp/tmpZo9zyO.html
/tmp/tmpZo9zyO.html
Now open file:///tmp/tmpbg1Sms.html in your web browser

[600, 601, 603, 605, 606, 608, 609, 611, 612]
[549, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609]
original article tmp file  /tmp/tmpAMm2MS.html
/tmp/tmpmry04y.html
/tmp/tmpAMm2MS.html
/tmp/tmpAMm2MS.html
Now open file:///tmp/tmpmry04y.html in your web browser

[72, 125]
[72]
original article tmp file  /tmp/tmpUhTSgI.html
/tmp/tmp1FbZZ4.html
/tmp/tmpUhTSgI.html
/tmp/tmpUhTSgI.html
Now open file:///tmp/tmp1FbZZ4.html in your web browser

[252, 253, 254, 255, 256, 257]
[252, 253, 254, 255, 256, 257]
original article tmp file  /tmp/tmpYfb0az.html
/tmp/tmpgmSwEE.html
/tmp/tmpYfb0az.html
/tmp/tmpYfb0az.html
Now open file:///tmp/tmpgmSwEE.html in your web browser

[250, 251, 252, 253, 254, 255, 308, 309]
[227, 228, 231, 233, 250, 251, 252, 253, 254, 255]
original article tmp file  /tmp/tmp_py675.html
/tmp/tmpffoZ9u.html
/tmp/tmp_py675.html
/tmp/tmp_py675.html
Now open file:///tmp/tmpffoZ9u.html in your web browser

[100, 102, 104, 106, 108, 110, 112, 114, 116, 118]
[89, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118]
original article tmp file  /tmp/tmpaKWHBV.html
/tmp/tmppD8YsW.html
/tmp/tmpaKWHBV.html
/tmp/tmpaKWHBV.html
Now open file:///tmp/tmppD8YsW.html in your web browser


In [30]:
download = get_download( 391881020 )
download['raw_content']
download['url']
#None


Out[30]:
u'http://virgula.uol.com.br/inacreditavel/bizarro/metro-tem-confusao-por-causa-de-encontro-em-shopping'

In [31]:
print lines_to_comparable_text( extract_with_python_readability( download['raw_content' ] ) )


Metrô tem confusão por causa de encontro em shopping; veja vídeo Era para ser um sábado muvucado como todos aqueles que antecedem o Natal em mais um shopping de São Paulo. Mas foi muito mais do que isso. Milhares de jovens compareceram a um encontro promovido por uma página no Facebook para este sábado (7), no segundo andar do Shopping Metrô Itaquera, às 17h. Mas o que aconteceu foi uma bagunça total, com a chegada da polícia e o fechamento do shopping duas horas antes que o previsto.Ninguém sabe ao certo o que aconteceu, alguns dizem que foi arrastão, mas os próprios lojistas negam. De acordo com um rapaz que registrou a movimentação na saída do metrô, algumas pessoas isoladamente tiveram itens furtados. Infelizmente, o celular do garoto era meio ruim, mas deu para ver a confusão. Veja aqui o vídeo que ele fez.Como muita gente compareceu, houve a confusão. O certo é que já está marcado o terceiro encontro, que é mais uma badalação entre a galera, ou melhor, como diz a própria página, é só um Rolêzinho Parte 3. Tudo começou na semana passado, no domingo (1), com o Vuuk no Shopping Itaquera.

In [32]:
import readability

doc = readability.Document( download['raw_content'] )

doc.content()

print doc.short_title()
print lines_to_comparable_text( [ doc.summary() ] )


Metrô tem confusão por causa de encontro em shopping; veja vídeo
Era para ser um sábado muvucado como todos aqueles que antecedem o Natal em mais um shopping de São Paulo. Mas foi muito mais do que isso. Milhares de jovens compareceram a um encontro promovido por uma página no Facebook para este sábado (7), no segundo andar do Shopping Metrô Itaquera, às 17h. Mas o que aconteceu foi uma bagunça total, com a chegada da polícia e o fechamento do shopping duas horas antes que o previsto.Ninguém sabe ao certo o que aconteceu, alguns dizem que foi arrastão, mas os próprios lojistas negam. De acordo com um rapaz que registrou a movimentação na saída do metrô, algumas pessoas isoladamente tiveram itens furtados. Infelizmente, o celular do garoto era meio ruim, mas deu para ver a confusão. Veja aqui o vídeo que ele fez.Como muita gente compareceu, houve a confusão. O certo é que já está marcado o terceiro encontro, que é mais uma badalação entre a galera, ou melhor, como diz a própria página, é só um Rolêzinho Parte 3. Tudo começou na semana passado, no domingo (1), com o Vuuk no Shopping Itaquera.
/usr/local/lib/python2.7/dist-packages/readability/htmls.py:107: FutureWarning: The behavior of this method will change in future versions. Use specific 'len(elem)' or 'elem is not None' test instead.
  raw_html = unicode(tostring(doc.body or doc))

In [25]:
extractor_training_objects = []

In [24]:
#raw_content

#annotations

def get_element_anncestor_indexes( anchor_el ):
    element_indexes = []
    
    if anchor_el.is_text:
        element_indexes.append( 0 )
    elif anchor.is_tail:
        element_indexes.append( 1 )
    else:
        assert False
    
    element = anchor_el.getparent()
    
    while element.getparent() is not None:
        
        #print 'el', element
        element_index = element.getparent().index( element )
        #print 'index', element.getparent().index( element )
        element_indexes.append( element_index )
        element = element.getparent()
        
    #print element_indexes
    element_indexes.reverse()
    
    return element_indexes

def remove_last_div( xpath ):
    if '/div' not in xpath:
        print 'no div in xpath', xpath
        return xpath
    else:
        i = xpath.rfind( '/div' )
        e = xpath.find('/', i + 1 )
        return xpath[:i] + xpath[e:]

def get_element_from_xpath( root, xpath ):
    if len(root.xpath( xpath )) > 0:
        return root.xpath( xpath )[0]
    elif len(root.xpath( remove_last_div( xpath )) ) > 0:
        return root.xpath( remove_last_div( xpath ))[0]
    
    else:
        
        while len(root.xpath( xpath )) == 0:
            pass
            assert '/' in xpath[2:]
            
            xpath = '/' + xpath[ xpath.find( '/', 2 ): ]
        
        return root.xpath( xpath )[0]
    
def sort_annotations( annotations, raw_content ):
    
    htmlparser = etree.HTMLParser()
    root = etree.fromstring( raw_content, htmlparser )
    
    #print annotation[ 'start_xpath' ]
    annotation = annotations[0]
    start_container = root.xpath( annotation[ 'start_xpath' ] )[0]
    end_container   = root.xpath( annotation[ 'end_xpath' ] )[0]
    
    get_element_anncestor_indexes( start_container ) 
    #element = start_container.getparent()
    annotations.sort( key= lambda a: get_element_anncestor_indexes( get_element_from_xpath( root, a['start_xpath'] ) ) )
    
    return annotations

f = "/html/div/d"
'/' + f[f.find('/',2):]

#"oood" in f


Out[24]:
'//div/d'

In [40]:
import sqlite3

db = sqlite3.connect('extractor_train_dbs/dev_2014-11-03T09_06_11-0500.db')
db.row_factory = sqlite3.Row

cursor = db.cursor()

cursor.execute( "SELECT * from dlannotations  where last_updated > '2014-10-31 00:00:00.069409' order by downloads_id" )

extractor_training_objects = []

skipped_downloads = 0
for row in list( cursor.fetchall() )[:]:
    row =  dict([ (k, row[k]) for k in row.keys() ])
    
    #print row
    
    row['annotations'] = json.loads( row['annotations_json'] )
    row['raw_content'] = u'' + row['raw_content']
    annotated_text  = []
    
    try:
        annotations = row['annotations']
        download = get_download( row['downloads_id'] )
    
        raw_content = u'' + download[
                                     u'raw_content']

        annotations = sort_annotations( annotations, raw_content )
        for annotation in annotations:            
            annotated_text.extend(get_annotated_text( u''+ raw_content, annotation ))
        
        eto = create_extractor_training_object( row['downloads_id'], expected_lines=annotated_text )
        
        #assert eto['raw_content'] != row['raw_content']
        
        if eto['raw_content'] != row['raw_content']:
            #TODO figure out why these may differ
            pass
        
            #d = difflib.Differ()
            #diff = d.compare(eto['raw_content'].splitlines(1), row['raw_content'].splitlines(1))
            #print '\n'.join(diff)
        
        extractor_training_objects.append( eto )
        
    except:
        print "error"
        print 'downloads_id', row['downloads_id']
        print annotation
        skipped_downloads += 1
        raise

print "skipped", skipped_downloads
print "processed", len(extractor_training_objects)


sc Ростуризм вслед за МИДом посоветовал россиянам не ездить в Бангкок
ec Со своей стороны, российские авиакомпании «Аэрофлот» и «Трансаэро» сообщили, что не намерены отменять рейсы в Бангкок на 13 января (на этот день запланирована акция протеста таиландской оппозиции). Об этом заявили в пресс-службах перевозчиков.
common
end inner loop
end loop
<Element article at 0x7ff5a16c54b0>
[<Element article at 0x7ff5a16c54b0>, <Element div at 0x7ff5a16c53c0>, <Element div at 0x7ff5a16c5820>, <Element div at 0x7ff5a16c5280>, <Element body at 0x7ff5a16c58c0>, <Element html at 0x7ff5a16c5b40>]
False
commonacccestor <Element article at 0x7ff5a16c54b0>
ca's processed children
start True
outer loop: <Element div at 0x7ff5a16c5b90> 
<div class="b-media b-photoreport_inside photoreport"><div class="info">

 <time pubdate="2014-01-10T12:45:52+04:00">10.01.2014, 12:45</time> 










 
|&#160;<span rel="author">&#171;&#1043;&#1072;&#1079;&#1077;&#1090;&#1072;.Ru&#187;</span>
 </div> </div>

inner loop
<Element div at 0x7ff5a16c5b90>
<div class="b-media b-photoreport_inside photoreport"><div class="info">

 <time pubdate="2014-01-10T12:45:52+04:00">10.01.2014, 12:45</time> 










 
|&#160;<span rel="author">&#171;&#1043;&#1072;&#1079;&#1077;&#1090;&#1072;.Ru&#187;</span>
 </div> </div>

inner loop
<Element div at 0x7ff5a16c50f0>
<div style="clear:both"/>

end inner loop
outer loop: <Element p at 0x7ff5a16c59b0> 
<p>&#1060;&#1077;&#1076;&#1077;&#1088;&#1072;&#1083;&#1100;&#1085;&#1086;&#1077; &#1072;&#1075;&#1077;&#1085;&#1090;&#1089;&#1090;&#1074;&#1086; &#1087;&#1086; &#1090;&#1091;&#1088;&#1080;&#1079;&#1084;&#1091; &#1074;&#1089;&#1083;&#1077;&#1076; &#1079;&#1072; &#1052;&#1080;&#1085;&#1080;&#1089;&#1090;&#1077;&#1088;&#1089;&#1090;&#1074;&#1086;&#1084; &#1080;&#1085;&#1086;&#1089;&#1090;&#1088;&#1072;&#1085;&#1085;&#1099;&#1093; &#1076;&#1077;&#1083; &#1056;&#1086;&#1089;&#1089;&#1080;&#1080; &#1087;&#1086;&#1089;&#1086;&#1074;&#1077;&#1090;&#1086;&#1074;&#1072;&#1083;&#1086; &#1075;&#1088;&#1072;&#1078;&#1076;&#1072;&#1085;&#1072;&#1084; &#1089;&#1090;&#1088;&#1072;&#1085;&#1099; <a href="http://www.gazeta.ru/travel/news/2014/01/09/n_5865289.shtml"><b>&#1085;&#1077; &#1077;&#1079;&#1076;&#1080;&#1090;&#1100; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082;</b></a> &#1074; &#1089;&#1074;&#1103;&#1079;&#1080; &#1089; &#1075;&#1086;&#1090;&#1086;&#1074;&#1103;&#1097;&#1080;&#1084;&#1080;&#1089;&#1103; &#1087;&#1088;&#1086;&#1090;&#1077;&#1089;&#1090;&#1072;&#1084;&#1080; &#1090;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1089;&#1082;&#1086;&#1081; &#1086;&#1087;&#1087;&#1086;&#1079;&#1080;&#1094;&#1080;&#1080;. &#1054;&#1073; &#1101;&#1090;&#1086;&#1084; &#1089;&#1086;&#1086;&#1073;&#1097;&#1072;&#1077;&#1090; &#1086;&#1092;&#1080;&#1094;&#1080;&#1072;&#1083;&#1100;&#1085;&#1099;&#1081; <a href="http://www.russiatourism.ru/news/-33562533/" target="_blank"><b>&#1089;&#1072;&#1081;&#1090;</b></a> <a class="tag" href="/tags/rosturizm.shtml">&#1056;&#1086;&#1089;&#1090;&#1091;&#1088;&#1080;&#1079;&#1084;&#1072;</a>.</p>
inner loop
<Element p at 0x7ff5a16c59b0>
<p>&#1060;&#1077;&#1076;&#1077;&#1088;&#1072;&#1083;&#1100;&#1085;&#1086;&#1077; &#1072;&#1075;&#1077;&#1085;&#1090;&#1089;&#1090;&#1074;&#1086; &#1087;&#1086; &#1090;&#1091;&#1088;&#1080;&#1079;&#1084;&#1091; &#1074;&#1089;&#1083;&#1077;&#1076; &#1079;&#1072; &#1052;&#1080;&#1085;&#1080;&#1089;&#1090;&#1077;&#1088;&#1089;&#1090;&#1074;&#1086;&#1084; &#1080;&#1085;&#1086;&#1089;&#1090;&#1088;&#1072;&#1085;&#1085;&#1099;&#1093; &#1076;&#1077;&#1083; &#1056;&#1086;&#1089;&#1089;&#1080;&#1080; &#1087;&#1086;&#1089;&#1086;&#1074;&#1077;&#1090;&#1086;&#1074;&#1072;&#1083;&#1086; &#1075;&#1088;&#1072;&#1078;&#1076;&#1072;&#1085;&#1072;&#1084; &#1089;&#1090;&#1088;&#1072;&#1085;&#1099; <a href="http://www.gazeta.ru/travel/news/2014/01/09/n_5865289.shtml"><b>&#1085;&#1077; &#1077;&#1079;&#1076;&#1080;&#1090;&#1100; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082;</b></a> &#1074; &#1089;&#1074;&#1103;&#1079;&#1080; &#1089; &#1075;&#1086;&#1090;&#1086;&#1074;&#1103;&#1097;&#1080;&#1084;&#1080;&#1089;&#1103; &#1087;&#1088;&#1086;&#1090;&#1077;&#1089;&#1090;&#1072;&#1084;&#1080; &#1090;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1089;&#1082;&#1086;&#1081; &#1086;&#1087;&#1087;&#1086;&#1079;&#1080;&#1094;&#1080;&#1080;. &#1054;&#1073; &#1101;&#1090;&#1086;&#1084; &#1089;&#1086;&#1086;&#1073;&#1097;&#1072;&#1077;&#1090; &#1086;&#1092;&#1080;&#1094;&#1080;&#1072;&#1083;&#1100;&#1085;&#1099;&#1081; <a href="http://www.russiatourism.ru/news/-33562533/" target="_blank"><b>&#1089;&#1072;&#1081;&#1090;</b></a> <a class="tag" href="/tags/rosturizm.shtml">&#1056;&#1086;&#1089;&#1090;&#1091;&#1088;&#1080;&#1079;&#1084;&#1072;</a>.</p>
inner loop
<Element p at 0x7ff5a16c5be0>
<p>&#171;<a class="tag" href="/tags/ministerstvo_inostrannyh_del_rf.shtml">&#1052;&#1048;&#1044; &#1056;&#1086;&#1089;&#1089;&#1080;&#1080;</a> &#1080; &#1056;&#1086;&#1089;&#1090;&#1091;&#1088;&#1080;&#1079;&#1084; &#1085;&#1072;&#1089;&#1090;&#1086;&#1103;&#1090;&#1077;&#1083;&#1100;&#1085;&#1086; &#1088;&#1077;&#1082;&#1086;&#1084;&#1077;&#1085;&#1076;&#1091;&#1102;&#1090; &#1088;&#1086;&#1089;&#1089;&#1080;&#1081;&#1089;&#1082;&#1080;&#1084; &#1075;&#1088;&#1072;&#1078;&#1076;&#1072;&#1085;&#1072;&#1084; &#1074;&#1086;&#1079;&#1076;&#1077;&#1088;&#1078;&#1072;&#1090;&#1100;&#1089;&#1103; &#1086;&#1090; &#1087;&#1086;&#1077;&#1079;&#1076;&#1086;&#1082; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082;, &#1072; &#1087;&#1088;&#1080;&#1085;&#1080;&#1084;&#1072;&#1102;&#1097;&#1080;&#1084; &#1085;&#1072;&#1096;&#1080;&#1093; &#1090;&#1091;&#1088;&#1080;&#1089;&#1090;&#1086;&#1074; &#1082;&#1086;&#1084;&#1087;&#1072;&#1085;&#1080;&#1103;&#1084; &#8211; &#1086;&#1090;&#1084;&#1077;&#1085;&#1080;&#1090;&#1100; &#1101;&#1082;&#1089;&#1082;&#1091;&#1088;&#1089;&#1080;&#1080; &#1074; &#1090;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1089;&#1082;&#1091;&#1102; &#1089;&#1090;&#1086;&#1083;&#1080;&#1094;&#1091; &#1085;&#1072; &#1087;&#1077;&#1088;&#1080;&#1086;&#1076; &#1074;&#1086;&#1079;&#1084;&#1086;&#1078;&#1085;&#1099;&#1093; &#1084;&#1072;&#1089;&#1089;&#1086;&#1074;&#1099;&#1093; &#1074;&#1099;&#1089;&#1090;&#1091;&#1087;&#1083;&#1077;&#1085;&#1080;&#1081; &#1086;&#1087;&#1087;&#1086;&#1079;&#1080;&#1094;&#1080;&#1080;&#187;, &#8211; &#1075;&#1086;&#1074;&#1086;&#1088;&#1080;&#1090;&#1089;&#1103; &#1074; &#1089;&#1086;&#1086;&#1073;&#1097;&#1077;&#1085;&#1080;&#1080;.</p>
inner loop
<Element p at 0x7ff5a16c5a50>
<p>&#1043;&#1083;&#1072;&#1074;&#1072; &#1087;&#1088;&#1077;&#1089;&#1089;-&#1089;&#1083;&#1091;&#1078;&#1073;&#1099; &#1056;&#1086;&#1089;&#1090;&#1091;&#1088;&#1080;&#1079;&#1084;&#1072; &#1048;&#1088;&#1080;&#1085;&#1072; &#1065;&#1077;&#1075;&#1086;&#1083;&#1100;&#1082;&#1086;&#1074;&#1072; &#1086;&#1090;&#1084;&#1077;&#1090;&#1080;&#1083;&#1072;, &#1074; &#1089;&#1074;&#1086;&#1102; &#1086;&#1095;&#1077;&#1088;&#1077;&#1076;&#1100;, &#1095;&#1090;&#1086; &#1074;&#1077;&#1076;&#1086;&#1084;&#1089;&#1090;&#1074;&#1086; &#1085;&#1077; &#1087;&#1086;&#1083;&#1091;&#1095;&#1072;&#1083;&#1086; &#1078;&#1072;&#1083;&#1086;&#1073; &#1086;&#1090; &#1086;&#1090;&#1076;&#1099;&#1093;&#1072;&#1102;&#1097;&#1080;&#1093; &#1074; &#1058;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1077; &#1088;&#1086;&#1089;&#1089;&#1080;&#1103;&#1085;.</p>
inner loop
<Element p at 0x7ff5a16c5500>
<p>&#171;&#1041;&#1099;&#1083;&#1080; &#1079;&#1074;&#1086;&#1085;&#1082;&#1080; &#1086;&#1090; &#1090;&#1091;&#1088;&#1080;&#1089;&#1090;&#1086;&#1074;, &#1082;&#1086;&#1090;&#1086;&#1088;&#1099;&#1077; &#1080;&#1085;&#1090;&#1077;&#1088;&#1077;&#1089;&#1086;&#1074;&#1072;&#1083;&#1080;&#1089;&#1100;, &#1084;&#1086;&#1075;&#1091;&#1090; &#1083;&#1080; &#1086;&#1085;&#1080; &#1086;&#1090;&#1084;&#1077;&#1085;&#1080;&#1090;&#1100; &#1089;&#1074;&#1086;&#1102; &#1087;&#1086;&#1077;&#1079;&#1076;&#1082;&#1091; &#1080; &#1074;&#1077;&#1088;&#1085;&#1091;&#1090;&#1100; &#1076;&#1077;&#1085;&#1100;&#1075;&#1080;, &#1085;&#1086; &#1085;&#1080;&#1082;&#1072;&#1082;&#1080;&#1093; &#1078;&#1072;&#1083;&#1086;&#1073; &#1086;&#1090; &#1090;&#1077;&#1093;, &#1082;&#1090;&#1086; &#1086;&#1090;&#1076;&#1099;&#1093;&#1072;&#1077;&#1090; &#1085;&#1072; &#1082;&#1091;&#1088;&#1086;&#1088;&#1090;&#1072;&#1093; &#1058;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1072;, &#1084;&#1099; &#1085;&#1077; &#1087;&#1086;&#1083;&#1091;&#1095;&#1072;&#1083;&#1080;&#187;, &#8211; &#1086;&#1090;&#1084;&#1077;&#1090;&#1080;&#1083;&#1072; &#1065;&#1077;&#1075;&#1086;&#1083;&#1100;&#1082;&#1086;&#1074;&#1072;, &#1089;&#1083;&#1086;&#1074;&#1072; &#1082;&#1086;&#1090;&#1086;&#1088;&#1086;&#1081; &#1087;&#1088;&#1080;&#1074;&#1086;&#1076;&#1080;&#1090; <a href="http://www.itar-tass.com/" target="_blank"><b>&#1048;&#1058;&#1040;&#1056;-&#1058;&#1040;&#1057;&#1057;</b></a>.</p>
inner loop
<Element p at 0x7ff5a16c5c30>
<p>&#1055;&#1088;&#1077;&#1076;&#1089;&#1090;&#1072;&#1074;&#1080;&#1090;&#1077;&#1083;&#1100; &#1056;&#1086;&#1089;&#1090;&#1091;&#1088;&#1080;&#1079;&#1084;&#1072; &#1087;&#1086;&#1076;&#1095;&#1077;&#1088;&#1082;&#1085;&#1091;&#1083;&#1072;, &#1095;&#1090;&#1086; &#1089;&#1080;&#1090;&#1091;&#1072;&#1094;&#1080;&#1103; &#1074; &#1058;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1077; &#1086;&#1089;&#1090;&#1072;&#1077;&#1090;&#1089;&#1103; &#1089;&#1087;&#1086;&#1082;&#1086;&#1081;&#1085;&#1086;&#1081; &#1080; &#1073;&#1077;&#1079;&#1086;&#1087;&#1072;&#1089;&#1085;&#1086;&#1081;.</p>
end inner loop
found end_contain parent, exiting loop: <Element p at 0x7ff5a16c51e0> 
<p>&#1057;&#1086; &#1089;&#1074;&#1086;&#1077;&#1081; &#1089;&#1090;&#1086;&#1088;&#1086;&#1085;&#1099;, &#1088;&#1086;&#1089;&#1089;&#1080;&#1081;&#1089;&#1082;&#1080;&#1077; &#1072;&#1074;&#1080;&#1072;&#1082;&#1086;&#1084;&#1087;&#1072;&#1085;&#1080;&#1080; &#171;&#1040;&#1101;&#1088;&#1086;&#1092;&#1083;&#1086;&#1090;&#187; &#1080; &#171;&#1058;&#1088;&#1072;&#1085;&#1089;&#1072;&#1101;&#1088;&#1086;&#187; &#1089;&#1086;&#1086;&#1073;&#1097;&#1080;&#1083;&#1080;, &#1095;&#1090;&#1086; &#1085;&#1077; &#1085;&#1072;&#1084;&#1077;&#1088;&#1077;&#1085;&#1099; &#1086;&#1090;&#1084;&#1077;&#1085;&#1103;&#1090;&#1100; &#1088;&#1077;&#1081;&#1089;&#1099; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082; &#1085;&#1072; 13 &#1103;&#1085;&#1074;&#1072;&#1088;&#1103; (&#1085;&#1072; &#1101;&#1090;&#1086;&#1090; &#1076;&#1077;&#1085;&#1100; &#1079;&#1072;&#1087;&#1083;&#1072;&#1085;&#1080;&#1088;&#1086;&#1074;&#1072;&#1085;&#1072; &#1072;&#1082;&#1094;&#1080;&#1103; &#1087;&#1088;&#1086;&#1090;&#1077;&#1089;&#1090;&#1072; &#1090;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1089;&#1082;&#1086;&#1081; &#1086;&#1087;&#1087;&#1086;&#1079;&#1080;&#1094;&#1080;&#1080;). &#1054;&#1073; &#1101;&#1090;&#1086;&#1084; &#1079;&#1072;&#1103;&#1074;&#1080;&#1083;&#1080; &#1074; &#1087;&#1088;&#1077;&#1089;&#1089;-&#1089;&#1083;&#1091;&#1078;&#1073;&#1072;&#1093; &#1087;&#1077;&#1088;&#1077;&#1074;&#1086;&#1079;&#1095;&#1080;&#1082;&#1086;&#1074;.</p>

<p>&#1057;&#1086; &#1089;&#1074;&#1086;&#1077;&#1081; &#1089;&#1090;&#1086;&#1088;&#1086;&#1085;&#1099;, &#1088;&#1086;&#1089;&#1089;&#1080;&#1081;&#1089;&#1082;&#1080;&#1077; &#1072;&#1074;&#1080;&#1072;&#1082;&#1086;&#1084;&#1087;&#1072;&#1085;&#1080;&#1080; &#171;&#1040;&#1101;&#1088;&#1086;&#1092;&#1083;&#1086;&#1090;&#187; &#1080; &#171;&#1058;&#1088;&#1072;&#1085;&#1089;&#1072;&#1101;&#1088;&#1086;&#187; &#1089;&#1086;&#1086;&#1073;&#1097;&#1080;&#1083;&#1080;, &#1095;&#1090;&#1086; &#1085;&#1077; &#1085;&#1072;&#1084;&#1077;&#1088;&#1077;&#1085;&#1099; &#1086;&#1090;&#1084;&#1077;&#1085;&#1103;&#1090;&#1100; &#1088;&#1077;&#1081;&#1089;&#1099; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082; &#1085;&#1072; 13 &#1103;&#1085;&#1074;&#1072;&#1088;&#1103; (&#1085;&#1072; &#1101;&#1090;&#1086;&#1090; &#1076;&#1077;&#1085;&#1100; &#1079;&#1072;&#1087;&#1083;&#1072;&#1085;&#1080;&#1088;&#1086;&#1074;&#1072;&#1085;&#1072; &#1072;&#1082;&#1094;&#1080;&#1103; &#1087;&#1088;&#1086;&#1090;&#1077;&#1089;&#1090;&#1072; &#1090;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1089;&#1082;&#1086;&#1081; &#1086;&#1087;&#1087;&#1086;&#1079;&#1080;&#1094;&#1080;&#1080;). &#1054;&#1073; &#1101;&#1090;&#1086;&#1084; &#1079;&#1072;&#1103;&#1074;&#1080;&#1083;&#1080; &#1074; &#1087;&#1088;&#1077;&#1089;&#1089;-&#1089;&#1083;&#1091;&#1078;&#1073;&#1072;&#1093; &#1087;&#1077;&#1088;&#1077;&#1074;&#1086;&#1079;&#1095;&#1080;&#1082;&#1086;&#1074;.</p>

False
True
escape while
<p>&#1057;&#1086; &#1089;&#1074;&#1086;&#1077;&#1081; &#1089;&#1090;&#1086;&#1088;&#1086;&#1085;&#1099;, &#1088;&#1086;&#1089;&#1089;&#1080;&#1081;&#1089;&#1082;&#1080;&#1077; &#1072;&#1074;&#1080;&#1072;&#1082;&#1086;&#1084;&#1087;&#1072;&#1085;&#1080;&#1080; &#171;&#1040;&#1101;&#1088;&#1086;&#1092;&#1083;&#1086;&#1090;&#187; &#1080; &#171;&#1058;&#1088;&#1072;&#1085;&#1089;&#1072;&#1101;&#1088;&#1086;&#187; &#1089;&#1086;&#1086;&#1073;&#1097;&#1080;&#1083;&#1080;, &#1095;&#1090;&#1086; &#1085;&#1077; &#1085;&#1072;&#1084;&#1077;&#1088;&#1077;&#1085;&#1099; &#1086;&#1090;&#1084;&#1077;&#1085;&#1103;&#1090;&#1100; &#1088;&#1077;&#1081;&#1089;&#1099; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082; &#1085;&#1072; 13 &#1103;&#1085;&#1074;&#1072;&#1088;&#1103; (&#1085;&#1072; &#1101;&#1090;&#1086;&#1090; &#1076;&#1077;&#1085;&#1100; &#1079;&#1072;&#1087;&#1083;&#1072;&#1085;&#1080;&#1088;&#1086;&#1074;&#1072;&#1085;&#1072; &#1072;&#1082;&#1094;&#1080;&#1103; &#1087;&#1088;&#1086;&#1090;&#1077;&#1089;&#1090;&#1072; &#1090;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1089;&#1082;&#1086;&#1081; &#1086;&#1087;&#1087;&#1086;&#1079;&#1080;&#1094;&#1080;&#1080;). &#1054;&#1073; &#1101;&#1090;&#1086;&#1084; &#1079;&#1072;&#1103;&#1074;&#1080;&#1083;&#1080; &#1074; &#1087;&#1088;&#1077;&#1089;&#1089;-&#1089;&#1083;&#1091;&#1078;&#1073;&#1072;&#1093; &#1087;&#1077;&#1088;&#1077;&#1074;&#1086;&#1079;&#1095;&#1080;&#1082;&#1086;&#1074;.</p>

start container
<h1 class="article_news_name">&#1056;&#1086;&#1089;&#1090;&#1091;&#1088;&#1080;&#1079;&#1084; &#1074;&#1089;&#1083;&#1077;&#1076; &#1079;&#1072; &#1052;&#1048;&#1044;&#1086;&#1084; &#1087;&#1086;&#1089;&#1086;&#1074;&#1077;&#1090;&#1086;&#1074;&#1072;&#1083; &#1088;&#1086;&#1089;&#1089;&#1080;&#1103;&#1085;&#1072;&#1084; &#1085;&#1077; &#1077;&#1079;&#1076;&#1080;&#1090;&#1100; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082;</h1>
      

Ростуризм вслед за МИДом посоветовал россиянам не ездить в Бангкок
offset 0
end container
<p>&#1057;&#1086; &#1089;&#1074;&#1086;&#1077;&#1081; &#1089;&#1090;&#1086;&#1088;&#1086;&#1085;&#1099;, &#1088;&#1086;&#1089;&#1089;&#1080;&#1081;&#1089;&#1082;&#1080;&#1077; &#1072;&#1074;&#1080;&#1072;&#1082;&#1086;&#1084;&#1087;&#1072;&#1085;&#1080;&#1080; &#171;&#1040;&#1101;&#1088;&#1086;&#1092;&#1083;&#1086;&#1090;&#187; &#1080; &#171;&#1058;&#1088;&#1072;&#1085;&#1089;&#1072;&#1101;&#1088;&#1086;&#187; &#1089;&#1086;&#1086;&#1073;&#1097;&#1080;&#1083;&#1080;, &#1095;&#1090;&#1086; &#1085;&#1077; &#1085;&#1072;&#1084;&#1077;&#1088;&#1077;&#1085;&#1099; &#1086;&#1090;&#1084;&#1077;&#1085;&#1103;&#1090;&#1100; &#1088;&#1077;&#1081;&#1089;&#1099; &#1074; &#1041;&#1072;&#1085;&#1075;&#1082;&#1086;&#1082; &#1085;&#1072; 13 &#1103;&#1085;&#1074;&#1072;&#1088;&#1103; (&#1085;&#1072; &#1101;&#1090;&#1086;&#1090; &#1076;&#1077;&#1085;&#1100; &#1079;&#1072;&#1087;&#1083;&#1072;&#1085;&#1080;&#1088;&#1086;&#1074;&#1072;&#1085;&#1072; &#1072;&#1082;&#1094;&#1080;&#1103; &#1087;&#1088;&#1086;&#1090;&#1077;&#1089;&#1090;&#1072; &#1090;&#1072;&#1080;&#1083;&#1072;&#1085;&#1076;&#1089;&#1082;&#1086;&#1081; &#1086;&#1087;&#1087;&#1086;&#1079;&#1080;&#1094;&#1080;&#1080;). &#1054;&#1073; &#1101;&#1090;&#1086;&#1084; &#1079;&#1072;&#1103;&#1074;&#1080;&#1083;&#1080; &#1074; &#1087;&#1088;&#1077;&#1089;&#1089;-&#1089;&#1083;&#1091;&#1078;&#1073;&#1072;&#1093; &#1087;&#1077;&#1088;&#1077;&#1074;&#1086;&#1079;&#1095;&#1080;&#1082;&#1086;&#1074;.</p>

Со своей стороны, российские авиакомпании «Аэрофлот» и «Трансаэро» сообщили, что не намерены отменять рейсы в Бангкок на 13 января (на этот день запланирована акция протеста таиландской оппозиции). Об этом заявили в пресс-службах перевозчиков.
offset 243
http://www.gazeta.ru/travel/news/2014/01/10/n_5866529.shtml
sc
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-40-534fb5318250> in <module>()
     29         annotations = sort_annotations( annotations, raw_content )
     30         for annotation in annotations:
---> 31             annotated_text.extend(get_annotated_text( u''+ raw_content, annotation ))
     32 
     33         eto = create_extractor_training_object( row['downloads_id'], expected_lines=annotated_text )

<ipython-input-39-c05667916d6c> in get_annotated_text(raw_content, annotation)
    103                 print "inner loop"
    104                 print el
--> 105                 assert el != None
    106                 print etree.tostring( el )
    107                 middle_contents.append( el )

AssertionError: 
 القبض على 3 إخوان هاجموا مؤتمرًا لدعم الدستور بالحوامدية 
ec <Element div at 0x7ff5a187a1e0>
common
end inner loop
end loop
<Element div at 0x7ff5a187af00>
[<Element div at 0x7ff5a187af00>, <Element div at 0x7ff5a187ad70>, <Element div at 0x7ff5a187acd0>, <Element div at 0x7ff5a187a370>, <Element body at 0x7ff5a187a9b0>, <Element html at 0x7ff5a16c5730>]
False
commonacccestor <Element div at 0x7ff5a187af00>
ca's processed children
start True
outer loop: <Element span at 0x7ff5a187a500> 
<span class="SubTitle"/>&#13;
            
inner loop
<Element span at 0x7ff5a187a500>
<span class="SubTitle"/>&#13;
            
inner loop
<Element div at 0x7ff5a187a460>
<div align="info">&#13;
                    <span class="atr_1">&#1603;&#1578;&#1576; : &#1605;&#1581;&#1605;&#1608;&#1583; &#1575;&#1604;&#1580;&#1575;&#1585;&#1581;&#1609; &#1608;&#1580;&#1610;&#1607;&#1575;&#1606; &#1593;&#1576;&#1583; &#1575;&#1604;&#1593;&#1586;&#1610;&#1586;</span>&#13;
                <span class="atr_2">&#1605;&#1606;&#1584; 22 &#1583;&#1602;&#1610;&#1602;&#1577;</span>&#13;
            </div>&#13;
        
end inner loop
outer loop: <Element a at 0x7ff5a187a550> 
<a href="javascript:window.print()" class="print_btn"> &#1591;&#1576;&#1575;&#1593;&#1577;</a>&#13;
         &#13;
            
inner loop
<Element a at 0x7ff5a187a550>
<a href="javascript:window.print()" class="print_btn"> &#1591;&#1576;&#1575;&#1593;&#1577;</a>&#13;
         &#13;
            
inner loop
<!-- AddThis Button BEGIN -->
<!-- AddThis Button BEGIN -->&#13;
              
inner loop
<Element div at 0x7ff5a187ad20>
<div>&#13;
                <ul class="social_btns">&#13;
                <li><a class="addthis_button_tweet"/></li>&#13;
                <li><a class="addthis_button_google_plusone" g:plusone:size="medium"/></li>&#13;
                <li><a class="addthis_counter addthis_pill_style"/></li>&#13;
                <li><a class="addthis_button_facebook_like" fb:like:layout="button_count"/></li>&#13;
               </ul>&#13;
                </div>               &#13;
                
inner loop
<Element script at 0x7ff5a187ae10>
<script type="text/javascript" src="http://s7.addthis.com/js/250/addthis_widget.js#pubid=ra-4fe0971970cac661"/> &#13;
            
inner loop
<!-- AddThis Button END -->
<!-- AddThis Button END -->            &#13;
            &#13;
              &#13;
        
inner loop
None
error
downloads_id 413070223
{u'end_offset': 0, u'end_xpath': u'/html[1]/body[1]/div[2]/div[3]/div[1]/div[2]/div[2]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[3]/div[1]/div[2]/h1[1]/text()[1]'}

In [192]:
import re 
htmlparser = etree.HTMLParser()
root = etree.fromstring( raw_content, htmlparser )

#print annotation[ 'start_xpath' ]
annotation = annotations[0]
start_container = root.xpath( annotation[ 'start_xpath' ] )[0]
end_container   = root.xpath( annotation[ 'end_xpath' ] )[0]

get_element_anncestor_indexes( start_container ) 
#element = start_container.getparent()
#annotations.sort( key= lambda a: get_element_anncestor_indexes( root.xpath( a['start_xpath'] )[0] ) )

#print row['annotations']

print download['downloads_id']
a = {u'end_offset': 28, u'end_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]'}

sp = '/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]'
s =  '/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]'
remove_last_div( sp )
    

#root.xpath( remove_last_div( sp ) )

#m = re.search( '\/div', sp )

#m.pos

#for a in row['annotations']:
#    print a[ 'start_xpath']
    #print raw_content
#    if len( root.xpath( a[ 'start_xpath' ] ) ) == 0:
#        sp = a['start_xpath']


582815971
Out[192]:
'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/ul[1]/li[2]/time[1]/text()[1]'

In [205]:
print len( extractor_training_objects )

comps_new_downloads = []
for extractor_training_object in extractor_training_objects:
    res = comp_extractors( extractor_training_object )
    #print res
    comps_new_downloads.append( res )

#extractor_training_objects


34

In [206]:
import pandas as pd



new_comps = []
for comp in comps_new_downloads:
    new_comp = {}
    new_comp = { 'downloads_id': comp['downloads_id'] }
    #del comp['downloads_id']
    
    extractor_types = [ k for k in comp.keys() if k != 'downloads_id' ]
    
    for extractor_type in extractor_types:
        new_comp.update([ ( k + '_' + extractor_type , v) for k,v in comp[ extractor_type ].iteritems() ])
        #new_comp[ k + 'boiler_pipe      

    new_comps.append( new_comp )
    
new_comps
df = pd.DataFrame( new_comps )
df.set_index('downloads_id', inplace=True )
df.describe(percentiles=[.5] )
result_types = [ 'precision', 'recall', 'f1' ]
for result_type in result_types:
    res_columns = [ col for col in df.columns if col.startswith( result_type ) ]
    #df.ix[:,['f1_boiler_pipe',	'f1_crf',	'f1_heur', 'f1_python_readibilty']].describe()
    print df.ix[:,res_columns].describe( percentiles=[0.5])
#df.describe()


       precision_crf  precision_heur  precision_justext  precision_py_goose  \
count      34.000000       34.000000                 34           34.000000   
mean        0.729586        0.751802                  0            0.898928   
std         0.363745        0.265162                  0            0.153675   
min         0.000000        0.000000                  0            0.233333   
50%         0.923995        0.856004                  0            0.948430   
max         0.998221        0.998221                  0            1.000000   

       precision_python_readibilty  
count                    34.000000  
mean                      0.881713  
std                       0.162957  
min                       0.218750  
50%                       0.942969  
max                       0.998246  
       recall_crf  recall_heur  recall_justext  recall_py_goose  \
count   34.000000    34.000000              34        34.000000   
mean     0.705665     0.792661               0         0.859228   
std      0.338169     0.221512               0         0.117726   
min      0.000000     0.000000               0         0.566069   
50%      0.878367     0.877363               0         0.910131   
max      0.992126     0.985827               0         0.992126   

       recall_python_readibilty  
count                 34.000000  
mean                   0.879025  
std                    0.137697  
min                    0.378467  
50%                    0.931640  
max                    0.992126  
          f1_crf    f1_heur  f1_justext  f1_py_goose  f1_python_readibilty
count  34.000000  34.000000          34    34.000000             34.000000
mean    0.695130   0.750929           0     0.869037              0.864516
std     0.345572   0.242706           0     0.129721              0.144475
min     0.000000   0.000000           0     0.358974              0.354430
50%     0.887016   0.852689           0     0.928846              0.929799
max     0.988235   0.985051           0     0.994475              0.988235

In [37]:
df.describe()


Out[37]:
f1_boiler_pipe f1_crf f1_heur f1_py_goose f1_python_readibilty precision_boiler_pipe precision_crf precision_heur precision_py_goose precision_python_readibilty recall_boiler_pipe recall_crf recall_heur recall_py_goose recall_python_readibilty
count 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
mean 0.746430 0.606308 0.774217 0.696688 0.965652 0.739779 0.787245 0.710740 0.982540 0.966188 0.775660 0.589271 0.928525 0.628317 0.967279
std 0.284490 0.422257 0.198469 0.324057 0.019898 0.277227 0.363474 0.250716 0.036192 0.036165 0.292057 0.443180 0.040053 0.352459 0.039788
min 0.042053 0.043321 0.307073 0.103261 0.934272 0.026814 0.060606 0.181963 0.901538 0.892601 0.097421 0.022140 0.872690 0.054441 0.880531
25% 0.713675 0.061224 0.764706 0.508251 0.943253 0.631433 0.734082 0.654088 1.000000 0.943590 0.823529 0.039823 0.887179 0.340708 0.938776
50% 0.821974 0.849403 0.811091 0.825273 0.968270 0.810234 0.991935 0.743933 1.000000 0.976836 0.850219 0.843700 0.933424 0.742309 0.980334
75% 0.965665 0.969828 0.898824 0.970976 0.985207 0.953390 1.000000 0.865079 1.000000 1.000000 0.978261 0.978261 0.965217 0.943590 1.000000
max 0.984694 0.991329 0.973913 0.989011 0.988806 0.979695 1.000000 1.000000 1.000000 1.000000 0.991786 1.000000 0.982808 0.978261 1.000000

In [38]:
res = compare_extractors_for_download( 461179954 )
res


[181, 182, 190, 191, 192, 193, 194, 195, 196, 197]
[190, 191, 192, 193, 194, 195, 196, 197]
[190, 191, 192, 193, 194, 195, 196, 197, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1172, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1189, 1191, 1192, 1193, 1194, 1195, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2769, 2770, 2771, 2772, 2773, 2774, 2775, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 3503, 3504, 3505, 3506, 3507, 3508, 3509, 3510, 3569, 3570, 3571, 3572, 3573]
original article tmp file  /tmp/tmpGyoriN.html
/tmp/tmpDvN04b.html
/tmp/tmpGyoriN.html
/tmp/tmpGyoriN.html
Now open file:///tmp/tmpDvN04b.html in your web browser

Out[38]:
{'boiler_pipe': {'f1': 0.05381484592027909,
  'precision': 0.027653288185107726,
  'recall': 0.9976047904191617},
 'crf': {'f1': 0.3154066985645933,
  'precision': 0.1876993166287016,
  'recall': 0.9868263473053892},
 'downloads_id': 461179954,
 'heur': {'f1': 0.9933694996986137,
  'precision': 1.0,
  'recall': 0.9868263473053892},
 'py_goose': {'f1': 0.9951865222623345,
  'precision': 1.0,
  'recall': 0.9904191616766467},
 'python_readibilty': {'f1': 0.2412060301507538,
  'precision': 0.3010752688172043,
  'recall': 0.20119760479041915}}

In [39]:
def gen_data(downloads_id, included_line_numbers):
    
    heuristic_training_ip = []
    c_t_ip = []
    h_t_ip = []
    
    try:
        
        #api_key = ''
        loc_key = api_key
    
        download = requests.get('https://api.mediacloud.org/api/v2/downloads/single/'+str(downloads_id)+'?key='+api_key)
        raw_content = download.json()[0][u'raw_content']
        stories_id = download.json()[0][u'stories_id']
    
        story = requests.get('https://api.mediacloud.org/api/v2/stories/single/'+str(stories_id)+'?key='+api_key)
        title = story.json()[0][u'title']
        description = story.json()[0][u'description']
        url = story.json()[0][u'url']

        story_lines_params = {'key':loc_key, 'body_html':raw_content}
        
        headers = {'Content-type': 'application/json'}
        story_lines = requests.put('https://api.mediacloud.org/api/v2/extractlines/story_lines',data=story_lines_params, headers=headers)

        #story_lines = requests.get('https://api.mediacloud.org/api/v2/extractlines/story_lines',params=story_lines_params)
        
        preprocessed_lines = story_lines.text

        heur_extract_params = {'key':loc_key, 'preprocessed_lines':preprocessed_lines, 'story_title':title, 'story_description':description, 'extractor_method':'HeuristicExtractor'}
        heur_extract = requests.get('https://api.mediacloud.org/api/v2/extractlines/extract',params=heur_extract_params)

        crf_extract_params = {'key':loc_key, 'preprocessed_lines':preprocessed_lines, 'story_title':title, 'story_description':description, 'extractor_method':'CrfExtractor'}
        crf_extract = requests.get('https://api.mediacloud.org/api/v2/extractlines/extract',params=crf_extract_params)

        for ln, hscore in enumerate(heur_extract.json()[u'scores']):

            t = 1 if str(ln) in included_line_numbers else 0

            if hscore[u'autoexcluded'] != 1:
                h_t_ip.append( (t, hscore[u'include_probability']) )

            cscore = crf_extract.json()[u'scores'][ln]
            if u'autoexcluded' not in cscore:
                c_t_ip.append( (t, cscore[u'include_probability']) ) 
    
    except Exception as e:
        
        pass

    return h_t_ip, c_t_ip

In [40]:
import mediacloud, requests, csv, sys, os, json, cPickle
from pyroc import *

#extractor_training_lines_checked has the training lines for downloads for which the highest line listed as 'included' was less than the number of lines in the download (max(included_line_numbers) < len(story_lines.json()))
f = open("extractor_training_lines_checked.json").read()
reader = json.loads(f)
heur = []
cPickle.dump(heur, open("heur.p", "wb"))
crf = []
cPickle.dump(crf, open("crf.p", "wb"))
done = []
cPickle.dump(done, open("done.p", "wb"))

for row in reader[:30]:
    
    did = row[u'downloads_id']
    lns = row[u'included_line_numbers']
    
    curh, curc = gen_data(did, lns)
    
    heur+=curh
    crf+=curc
    done.append(did)
    
cPickle.dump(done, open("done.p", "wb"))
cPickle.dump(heur, open("heur.p", "wb"))
cPickle.dump(crf, open("crf.p", "wb"))

In [41]:
import cPickle
from pyroc import *

heur = cPickle.load(open("heur.p","rb"))
crf = cPickle.load(open("crf.p","rb"))

rocheur = ROCData(heur)
roccrf = ROCData(crf)


print rocheur.auc()

plot_multiple_roc(rocList=(rocheur,roccrf), title='Extractor ROC Curve', labels=("heuristic curve","crf curve"))


0.0

In [74]:
annotation = {u'end_offset': 61, 
             u'start_offset': 0,
             u'end_xpath':   u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/h1[1]/text()[1]',
             u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/h1[1]/text()[1]'}
annotation = {u'end_offset': 28, u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[3]/text()[1]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[1]/text()[1]'}

annotation = {u'end_offset': 26, u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[15]/text()[2]', u'start_offset': 1, u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/span[2]/p[1]/text()[1]'}
annotation = {u'end_offset': 272, u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[2]/text()[1]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[1]/text()[1]'}

print annotation

get_annotated_text( raw_content, annotation )


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-74-d2e2ed8d91bd> in <module>()
     10 print annotation
     11 
---> 12 get_annotated_text( raw_content, annotation )

<ipython-input-18-c466c1e999a7> in get_annotated_text(raw_content, annotation)
      6     #print annotation[ 'start_xpath' ]
      7 
----> 8     start_container = root.xpath( annotation[ 'start_xpath' ] )[0]
      9     end_container   = root.xpath( annotation[ 'end_xpath' ] )[0]
     10 

IndexError: list index out of range
{u'end_offset': 272, u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[2]/text()[1]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[1]/text()[1]'}