Ministry of Education News
Script to parse news pages from a locally saved copy of education.govt.nz
In [26]:
import bs4
import requests
import os
import mammoth
In [26]:
In [27]:
# Local directory that holds the saved news pages; every later cell that
# touches the filesystem builds its paths from this root.
opnewsfo = '/media/removable/lemonyellow/educ/www.education.govt.nz/news'
In [28]:
# List the entries directly under the news root. os.listdir() returns names
# in arbitrary order, so sort them to make the listing (and every loop that
# walks it) reproducible from one run to the next.
osliz = sorted(os.listdir(opnewsfo))
In [29]:
# Display the directory listing held in osliz.
osliz
Out[29]:
In [30]:
# Drop the top-level index page from osliz so only the other entries remain.
# list.remove() mutates the list in place and returns None, so assigning its
# result always left osrem bound to None. It also raises ValueError when the
# entry is missing (for example when this cell is run a second time); the
# membership check makes the cell safe to re-run.
if 'index.html' in osliz:
    osliz.remove('index.html')
# osrem refers to the same, now filtered, list as osliz.
osrem = osliz
In [31]:
# Display the value bound to osrem by the cell above.
osrem
In [61]:
# Walk every entry under the news root and print the <p class="intro">
# paragraphs of each page found inside it.
for repoz in osliz:
    article_dir = os.path.join(opnewsfo, repoz)
    # Plain files at the top level (such as index.html) cannot be listed;
    # skip them instead of letting os.listdir() raise.
    if not os.path.isdir(article_dir):
        continue
    indef = os.listdir(article_dir)
    for ind in indef:
        # BeautifulSoup reads the whole file object while constructing the
        # soup, so the handle can be closed immediately afterwards. The
        # original opened one handle per page and never closed it.
        with open(os.path.join(article_dir, ind), 'r') as opso:
            souprep = bs4.BeautifulSoup(opso)
        # Only paragraphs carrying the class "intro" are wanted, not every <p>.
        for link in souprep.find_all('p', class_="intro"):
            # Parenthesised single-argument print runs on Python 2 and 3.
            print(link)
In [41]:
# Read the .docx once as raw text and once as HTML.
# A .docx file is a zip archive, so it must be opened in binary mode ("rb");
# text mode can corrupt the bytes or fail outright depending on platform and
# Python version.
with open("/home/wcmckee/Downloads/test.docx", "rb") as docx_file:
    result = mammoth.extract_raw_text(docx_file)
    text = result.value  # The raw text
    messages = result.messages  # Any messages
    # A later cell parses a variable named `html` that no cell defined;
    # produce it here from the same document. Rewind first so the second
    # conversion starts from the beginning of the file.
    docx_file.seek(0)
    html = mammoth.convert_to_html(docx_file).value
In [42]:
import bs4
In [43]:
# Parse the HTML string into a soup for querying. `html` must already hold
# the document's HTML markup when this cell runs.
soudocx = bs4.BeautifulSoup(markup=html)
In [44]:
# Collect every <p> element of the parsed document except the first one.
# find_all is the current Beautiful Soup spelling (findAll is the legacy
# alias) and matches the call used on the news pages earlier in the notebook.
soupnop = soudocx.find_all('p')[1:]
In [20]:
In [21]:
# Print the text content of each collected paragraph, one per line.
for sonp in soupnop:
    # Parenthesised single-argument print behaves the same on Python 2 and
    # also runs on Python 3, where the bare print statement is a SyntaxError.
    print(sonp.text)
In [ ]: