In [1]:
import os
import bs4


# encoding=utf8  
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

In [2]:
def map_data(name):
	"""
	Maps all stories in the data folder for the content provider
	"""
	data_dir = "data/" + name

	return os.listdir(data_dir)

In [8]:
print 'hello'

In [3]:
files[1]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-d8cd9fb2397b> in <module>()
----> 1 files[1]

NameError: name 'files' is not defined

In [ ]:
extractlist= []
files = map_data('dailyobserver')
def extract_dailyobserver(files):
    for doc in files:
        with open('data/dailyobserver/' + doc, 'r') as f:
            print doc
            try:
                soup = bs4.BeautifulSoup(f)
                title = title_cleaner(soup)
                content = contents_cleaner(soup)
                extractlist.append([title, content])
            except Exception as ex:
                print unicode(title), ex.__class__

In [ ]:
extract_dailyobserver(files)

In [ ]:
extractlist[:5][4][2]

Titles


In [ ]:
def title_cleaner(soup):
    title = soup.find(name="h1", attrs={'class':'title'})
    return title.text

In [ ]:
title = soup.find(name="h1", attrs={'class':'title'})

In [ ]:
title.text

In [ ]:
tags = ['<h1 class="title">', '</h1>' ]

In [ ]:
tags

In [ ]:
extractlist[2] = unicode(extractlist[2]).replace(tags[0], '').replace(tags[1], '')

In [ ]:
title_cleaner(soup)

Author


In [ ]:
body = soup.find(name="div", attrs={'class':'field-item even'})

In [ ]:
content = body.findAll(name='p')

In [7]:
print 'hello'

In [6]:
text


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-e1cbb0c3879a> in <module>()
----> 1 text

NameError: name 'text' is not defined

Content


In [203]:
def contents_cleaner(soup):
    contents = soup.findAll(name='div', attrs={'class':"field-item even"})#     title = str(title)
    return contents[0].text

In [190]:
contents = soup.findAll(name='div', attrs={'class':"field-item even"})

In [224]:
contents[0].text


Out[224]:
u'Nearly every week in recent times the media have carried major stories on the seizure of marijuana somewhere in Liberia.\xa0\nYesterday\u2019s back page lead story on Daily Observer\u2019s was no exception. \xa0Our reporter \xa0\xa0Alvin Worzi told the nation in that story yesterday that the Drug Enforcement Agency (DEA) had over the weekend discovered L$600,000 worth of marijuana in the ELWA Community.\xa0\nEven more alarming was DEA Director Anthony Souh\u2019s declaration that supply of the contraband (illegal goods) in the country was widespread, to the extent that it was being grown on several farms around the country.\xa0 There are indications that such farms are located, for starts, right here in Montserrado, but also in Bong, Lofa and Nimba counties.\xa0 This suggests that this nation is in for a long, tough, intractable battle with marijuana suppliers, if our youth are to be protected from this most dangerous scourge. \xa0\nMany studies over the years have confirmed extremely harmful effects of marijuana on the brain, especially of young people.\xa0 That is why the drug has been banned for decades in the United States and many other parts of the world.\nA study published last Wednesday in the Journal of Neuroscience through the collaborative effort between Northwestern University\'s Medical School, Massachusetts General Hospital and Harvard Medical School showed a direct correlation between the number of times users smoked and abnormalities in the brain.\nYoung, casual marijuana smokers experience potentially harmful changes to their brains, with the drug altering regions of the mind related to motivation and emotion, researchers found.\n"What we\'re seeing is changes in people who are 18 to 25 in core brain regions that you never, ever want to fool around with," said co-senior study author Dr. Hans Beiter, professor of psychiatry and behavioral sciences at Northwestern University.\nIn particular, the study identified changes to the nucleus accumbens and the nucleus amygdala, regions of the brain that are key to regulating emotion and motivation, in marijuana users who smoke between one and seven joints a week.\nThe researchers found changes to the volume, shape and density of those brain regions. But more studies are needed to determine how those changes may have long-term consequences and whether they can be fixed with abstinence, Beiter said.\nAt this low stage of Liberia\u2019s development, when so many newer nations are far ahead of us in so many respects, and our educational system is confronted with so many rudimentary challenges, we cannot afford this further distraction that could destroy the thinking faculties and the normal function of our children\u2019s brains.\nBut this is a challenge not for the government alone; all educational, religious, civic and social organizations must get involved in the fight against this blight which, if not tackled immediately, could have a devastating effect on our youth and our national progress.\nThe Ministries of Justice, Education, Health and Social Welfare, the National Drug Enforcement Agency, the Liberia National Students Union (LINSU), the National Students Christian Council \xa0\xa0(NSCC) and other related Non-Governmental Organizations need to join forces and develop a coherent and workable strategy to combat the spread of marijuana in our country.\nThis is an urgent challenge.\xa0\xa0\xa0 The sooner we can organize the forces to begin strategizing to arrest this scourge, the better for our nation\u2019s future.\n'

Extraction List


In [206]:



Out[206]:
[]

In [254]:
for elem in range(5):
    print extractlist[:][0][0]


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-254-581484b181df> in <module>()
      1 for elem in range(5):
----> 2     print extractlist[:elem][0][0]

IndexError: list index out of range

Notes

if title contains, 'edition' or month then it is a 404


In [9]:
soup.find(name='div', attrs={'class':"node-content-wrapper"})


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-594705d7d14c> in <module>()
----> 1 soup.find(name='div', attrs={'class':"node-content-wrapper"})

NameError: name 'soup' is not defined

In [ ]: