In [1]:
import os
import sys
import bs4

# Python 2 workaround: force UTF-8 as the default encoding so unicode
# article text can be printed and concatenated without UnicodeDecodeError.
# setdefaultencoding is hidden after interpreter startup, so sys must be
# reloaded first.
reload(sys)
sys.setdefaultencoding('utf8')
In [2]:
def map_data(name):
    """
    Lists all story files in the data folder for the given content provider.
    """
    data_dir = "data/" + name
    return os.listdir(data_dir)
In [3]:
files[1]
In [ ]:
extractlist = []
files = map_data('dailyobserver')

def extract_dailyobserver(files):
    # Parse every saved Daily Observer page and collect [title, content]
    # pairs in extractlist; pages that fail to parse are reported and skipped.
    for doc in files:
        with open('data/dailyobserver/' + doc, 'r') as f:
            print doc
            try:
                soup = bs4.BeautifulSoup(f, 'html.parser')
                title = title_cleaner(soup)
                content = contents_cleaner(soup)
                extractlist.append([title, content])
            except Exception as ex:
                # title may not be bound if parsing failed early, so report
                # the file name and the exception class instead
                print doc, ex.__class__
In [ ]:
extract_dailyobserver(files)
In [ ]:
extractlist[4][1]  # content of the fifth extracted story
In [ ]:
def title_cleaner(soup):
    """Extracts the story title from the page's <h1 class="title"> element."""
    title = soup.find(name="h1", attrs={'class': 'title'})
    return title.text
In [ ]:
title = soup.find(name="h1", attrs={'class':'title'})
In [ ]:
title.text
In [ ]:
tags = ['<h1 class="title">', '</h1>']
In [ ]:
tags
In [ ]:
# strip stray <h1> markup from the title of the third story
extractlist[2][0] = unicode(extractlist[2][0]).replace(tags[0], '').replace(tags[1], '')
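Replacing tag strings by hand is brittle; a minimal sketch of a more general cleaner (assuming only that the stray markup is parseable HTML) lets BeautifulSoup itself strip whatever tags survived extraction:
In [ ]:
def strip_tags(text):
    # hypothetical helper: re-parse the string and keep only its text,
    # dropping any tags that leaked through the extraction step
    return bs4.BeautifulSoup(unicode(text), 'html.parser').get_text()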
In [ ]:
title_cleaner(soup)
In [ ]:
body = soup.find(name="div", attrs={'class':'field-item even'})
In [ ]:
content = body.findAll(name='p')
In [203]:
def contents_cleaner(soup):
    """Extracts the story body from the first <div class="field-item even"> element."""
    contents = soup.findAll(name='div', attrs={'class': "field-item even"})
    return contents[0].text
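Both cleaners raise an exception when the expected element is missing, as happens on error pages. A minimal defensive variant, assuming the same selectors, returns None instead so callers can skip bad pages:
In [ ]:
def safe_extract(soup):
    # hypothetical helper: returns (title, content) or None when the
    # expected elements are missing, e.g. on 404 pages
    title = soup.find(name="h1", attrs={'class': 'title'})
    body = soup.find(name="div", attrs={'class': 'field-item even'})
    if title is None or body is None:
        return None
    return title.text, body.text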
In [190]:
contents = soup.findAll(name='div', attrs={'class':"field-item even"})
In [224]:
contents[0].text
In [254]:
# print the first five extracted titles for inspection
for elem in range(5):
    print extractlist[elem][0]
If a title contains 'edition' or a month name, the page is a 404 error page rather than a story.
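A sketch of that filter, assuming English month names and that extractlist holds [title, content] pairs:
In [ ]:
import calendar

# hypothetical filter based on the heuristic above: drop entries whose
# title mentions 'edition' or an English month name
months = [calendar.month_name[i].lower() for i in range(1, 13)]

def is_404(title):
    t = unicode(title).lower()
    return 'edition' in t or any(m in t for m in months)

stories = [pair for pair in extractlist if not is_404(pair[0])]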
In [9]:
soup.find(name='div', attrs={'class':"node-content-wrapper"})