In [1]:
import os

# Root directory of the training data; the cells below read its "original"
# and "annotated" sub-directories.  Set the GOOGLE_NEWS_TRAINING_DATA_ROOT
# environment variable to point the notebook at a different checkout; when it
# is unset the historical location is used unchanged.
google_news_training_data_root = os.environ.get(
    "GOOGLE_NEWS_TRAINING_DATA_ROOT",
    "/home/dlarochelle/Downloads/boiler_pipe",
)

In [8]:
import os

# Sorted file names found in the "original" and "annotated" sub-directories
# of the data root.  The corpus-building loop opens the same file name in
# both directories, so the two listings must match exactly.
original_dir  = os.path.join( google_news_training_data_root, "original" )
annotated_dir = os.path.join( google_news_training_data_root, "annotated" )

original  = sorted( os.listdir( original_dir ) )
annotated = sorted( os.listdir( annotated_dir ) )

# Names present in exactly one of the two directories; reported in the
# assertion messages so a failure says which files are unpaired.
unpaired = sorted( set( original ) ^ set( annotated ) )

assert len( original ) == len( annotated ), \
    "%d original files vs %d annotated files; unpaired: %r" % ( len( original ), len( annotated ), unpaired )

assert original == annotated, \
    "original and annotated listings differ; unpaired: %r" % ( unpaired, )

In [62]:
from lxml import html
from lxml import etree
import codecs

# Build the corpus: one dict per document with
#   'filename'      - file name shared by the annotated and original copies
#   'expected_html' - serialized annotated spans, joined by blank lines
#   'raw_content'   - full text of the un-annotated original page
# The loop variables (tree, spans, spanset, ...) are intentionally left at
# notebook scope; a later diagnostic cell reads `spans`.

files = 0  # number of documents processed so far

google_news_corpus = []

for html_file in annotated:
    annotated_path = os.path.join( google_news_training_data_root, "annotated", html_file )
    # `with` closes the handle even if decoding fails part-way through.
    with codecs.open( annotated_path, "r", "utf-8" ) as f:
        contents_annotated = f.read()

    tree = html.fromstring( contents_annotated )
    spans = tree.xpath('//span[@class="x-nc-sel3" or @class="x-nc-sel1"  or @class="x-nc-sel2" ]')
    spanset = set( spans )

    # etree.tostring() serializes an element together with its descendants,
    # so a selected span nested inside another selected span would appear
    # twice in the output.  Keep only spans with no selected ancestor.
    # iterancestors() checks every nesting depth and simply yields nothing
    # for a root element, where chained getparent() calls would hit None.
    non_dup_spans = [ span for span in spans
                      if not any( ancestor in spanset for ancestor in span.iterancestors() ) ]

    span_strings = [ etree.tostring( s ) for s in non_dup_spans ]
    story_html = "\n\n".join( span_strings )

    original_path = os.path.join( google_news_training_data_root, "original", html_file )
    with codecs.open( original_path, "r", "utf-8" ) as f:
        contents_original = f.read()

    google_news_corpus.append({ 'filename': html_file, 'expected_html': story_html, 'raw_content': contents_original })
    files += 1

['<span class="x-nc-sel1">Modi, BJP leaders discuss poll strategies</span>\n',
 '<span class="x-nc-sel3">Press Trust of India</span>\n',
 '<span class="x-nc-sel3">Saturday, December 29, 2007 (New Delhi)</span>\n',
 '<span class="x-nc-sel2">Having crafted a spectacular victory in Gujarat, Chief Minister Narendra Modi on Saturday made his first visit to the capital after the success and met the top BJP brass, which discussed strategies to face elections in about nine states in the new year ahead of the Lok Sabha polls.</span>\n',
 '<span class="x-nc-sel2">Modi, who single-handedly led his party\'s triumph in Gujarat and has been projected ever since to play a leading role in the national scene, met former prime minister Atal Bihari Vajpayee and took his blessings.</span>\n',
 '<span class="x-nc-sel2">From the airport he drove straight to L K Advani\'s residence where the top leaders including Rajnath Singh and Murli Manohar Joshi discussed the party\'s wins in the assembly elections in Gujarat and Himachal Pradesh.</span>\n',
 '<span class="x-nc-sel2">Advani, projected by the BJP as its prime ministerial candidate, had already termed the twin wins as the start of a \'\'process of change\'\' that will take the party to the Centre.</span>\n',
 '<span class="x-nc-sel2">The other leaders present at the meeting were Arun Jaitley, Jaswant Singh and Sushma Swaraj.</span>\n',
 '<span class="x-nc-sel2">\'\'The leaders deliberated on strategies to take forward the enthusiastic support the party has received from the people,\'\' BJP spokesman Ravi Shankar Prasad told reporters in new Delhi.</span>\n',
 '<span class="x-nc-sel2">In the meeting with the central leadership, Modi also discussed the formation of his ministry in Gujarat.</span>\n',
 '<span class="x-nc-sel2">\'\'I discussed cabinet formation in Gujarat,\'\' Modi told reporters.</span>\n',
 '<span class="x-nc-sel2">The leaders also deliberated on ways to take the BJP ideology to the people and to strengthen the NDA.</span>\n',
 '<span class="x-nc-sel2">\'\'We have presented an honest and effective leader -- L K Advani -- to the nation,\'\' Prasad said.</span>\n',
 '<span class="x-nc-sel2">He said a positive atmosphere has been generated in favour of the party following its impressive performance in the recent elections.</span>\n',
 '<span class="x-nc-sel2">The party\'s endeavour will be to take issues like national security and development to the people of the country who have been \'\'suffering\'\' under the UPA rule, Prasad said.</span>\n',
 '<span class="x-nc-sel2">On his arrival to Delhi, Modi, whose stunning victory in Gujarat had fuelled speculation that he would seek a greater role in the party\'s affairs at the Centre, was accorded a rousing welcome by the BJP workers.</span>\n',
 '<span class="x-nc-sel2">Modi was sworn in as Chief Minister on December 25 but is yet to announce his new team of ministers.</span>\n']

In [60]:
# Diagnostic: show the selected spans that are nested, at any depth, inside
# another selected span.  Reads `spans` as left behind by the last iteration
# of the corpus-building loop.
# iterancestors() yields nothing for a root element, so this cannot fail on
# a missing parent the way chained getparent() calls can.
spanset = set( spans )
[ span for span in spans if any( ancestor in spanset for ancestor in span.iterancestors() ) ]

[<Element span at 0x7f6ff8343350>,
 <Element span at 0x7f6ff83430b0>,
 <Element span at 0x7f6ff8343d10>,
 <Element span at 0x7f6ff83439b0>]

In [63]:
import cPickle
import os.path

# Persist the corpus as a pickle in the current working directory.
# The path contains no "~", so expanduser() returns it unchanged; it is kept
# so the destination can be switched to a "~/..." path without other edits.
# `with` flushes and closes the file as soon as the dump finishes.
with open( os.path.expanduser( 'boiler_pipe_google_news_corpus.pickle' ), 'wb' ) as pickle_file:
    cPickle.dump( google_news_corpus, pickle_file )