In [1]:
    
import os
import bs4
import sys

# Python 2 only: force the default string encoding to UTF-8 so implicit
# str<->unicode conversions of scraped text don't raise UnicodeDecodeError.
# (`reload` and `sys.setdefaultencoding` do not exist on Python 3, where
# all strings are already unicode — the unguarded calls crashed there.)
# NOTE(review): setdefaultencoding is a known anti-pattern; explicit
# .decode('utf8') at the I/O boundary would be safer.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')
    
In [2]:
    
def map_data(name, data_root="data"):
    """
    List all story files stored for the given content provider.

    Parameters
    ----------
    name : str
        Provider subdirectory name (e.g. 'dailyobserver').
    data_root : str, optional
        Root directory holding the per-provider folders (default "data").

    Returns
    -------
    list of str
        Filenames found in data_root/name (order is OS-dependent).
    """
    return os.listdir(os.path.join(data_root, name))
    
In [8]:
    
# Leftover kernel-check cell. Parenthesized print works on Python 2 and 3
# (the original py2-only print statement is a SyntaxError on Python 3).
print('hello')
    
In [3]:
    
# NOTE(review): `files` is only assigned in a later cell
# (`files = map_data('dailyobserver')`), so this cell relies on
# out-of-order execution and fails on Restart & Run All — move it below.
files[1]
    
    
In [ ]:
    
extractlist= []
files = map_data('dailyobserver')
def extract_dailyobserver(files):
    for doc in files:
        with open('data/dailyobserver/' + doc, 'r') as f:
            print doc
            try:
                soup = bs4.BeautifulSoup(f)
                title = title_cleaner(soup)
                content = contents_cleaner(soup)
                extractlist.append([title, content])
            except Exception as ex:
                print unicode(title), ex.__class__
    
In [ ]:
    
# Run the extraction; results accumulate in the module-level `extractlist`.
extract_dailyobserver(files)
    
In [ ]:
    
# Peek at the content of the 5th extracted story.
# (The original `extractlist[:5][4][2]` always raised IndexError: each
# entry is a 2-element [title, content] list, so index 2 is out of range,
# and the [:5] slice was redundant.)
extractlist[4][1]
    
In [ ]:
    
def title_cleaner(soup):
    """Extract the article title text from a parsed page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    str/unicode title text.

    Raises
    ------
    ValueError
        If no <h1 class="title"> element exists (e.g. on 404/edition
        pages) — previously this surfaced as an opaque AttributeError
        on `None.text`. Callers already catch Exception.
    """
    title = soup.find(name="h1", attrs={'class': 'title'})
    if title is None:
        raise ValueError("no <h1 class='title'> element found")
    return title.text
    
In [ ]:
    
# Scratch: locate the <h1 class="title"> node on the currently loaded soup.
title = soup.find(name="h1", attrs={'class':'title'})
    
In [ ]:
    
# Scratch: inspect the extracted title text.
title.text
    
In [ ]:
    
# Literal tag fragments to strip when cleaning raw title markup by hand.
tags = ['<h1 class="title">', '</h1>' ]
    
In [ ]:
    
# Scratch: inspect the tag fragments.
tags
    
In [ ]:
    
# NOTE(review): unicode() turns the [title, content] LIST into its repr
# string, so this replaces entry 2 with a string (with brackets/quotes
# embedded), not a cleaned pair — presumably unintended; verify before
# relying on extractlist[2] downstream.
extractlist[2] = unicode(extractlist[2]).replace(tags[0], '').replace(tags[1], '')
    
In [ ]:
    
# Scratch: re-test title_cleaner against the currently loaded soup.
title_cleaner(soup)
    
In [ ]:
    
# Scratch: locate the article body container div.
body = soup.find(name="div", attrs={'class':'field-item even'})
    
In [ ]:
    
# Scratch: collect the paragraph tags inside the body container.
content = body.findAll(name='p')
    
In [7]:
    
# Duplicate leftover kernel-check cell (see also the earlier 'hello').
# Parenthesized print works on Python 2 and 3.
print('hello')
    
In [6]:
    
# NOTE(review): `text` is never assigned anywhere in this notebook —
# this cell raises NameError on a fresh kernel run; delete or fix.
text
    
    
In [203]:
    
def contents_cleaner(soup):
    """Extract the article body text from a parsed page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed article page.

    Returns
    -------
    str/unicode text of the first <div class="field-item even"> element.

    Raises
    ------
    ValueError
        If no matching div exists — previously this surfaced as an
        opaque IndexError on `contents[0]`. Callers already catch
        Exception.
    """
    contents = soup.findAll(name='div', attrs={'class': "field-item even"})
    if not contents:
        raise ValueError("no <div class='field-item even'> element found")
    return contents[0].text
    
In [190]:
    
# Scratch: find all body-content divs on the currently loaded soup.
contents = soup.findAll(name='div', attrs={'class':"field-item even"})
    
In [224]:
    
# Scratch: inspect the text of the first body-content div.
contents[0].text
    
    Out[224]:
In [206]:
    
    
    Out[206]:
In [254]:
    
for elem in range(5):
    print extractlist[:][0][0]
    
    
Note: if the title contains 'edition' or a month name, the page is a 404 error page.
In [9]:
    
# Scratch: look up the <div class="node-content-wrapper"> element on the
# currently loaded soup.
soup.find(name='div', attrs={'class':"node-content-wrapper"})
    
    
In [ ]: