Ministry of Education News
Script to parse news on education.govt.nz
In [75]:
import bs4
import requests
import os
import mammoth
In [26]:
In [27]:
# Local directory that holds the education.govt.nz news pages to parse.
opnewsfo = '/media/removable/lemonyellow/educ/www.education.govt.nz/news'
In [28]:
# Names of every entry directly inside the news directory
# (os.listdir returns them in arbitrary order).
osliz = os.listdir(opnewsfo)
In [29]:
# Notebook cell echo: show the directory listing.
osliz
Out[29]:
In [30]:
# Drop 'index.html' from the listing; the loop further down calls
# os.listdir() on every remaining entry, so a plain file here would fail.
# list.remove() mutates the list in place and returns None, so its result
# must not be bound. The membership check also keeps this cell safe to
# re-run after the entry has already been removed.
if 'index.html' in osliz:
    osliz.remove('index.html')
# osrem now refers to the cleaned listing instead of always being None.
osrem = osliz
In [31]:
# Notebook cell echo: show the value bound to osrem.
osrem
In [73]:
# Accumulator for the text of every intro paragraph found while scraping.
firlpar = []
In [95]:
# Walk every entry of the news directory, parse each file found inside it,
# and collect the text of every <p class="intro"> element into firlpar.
for repoz in osliz:
    article_dir = os.path.join(opnewsfo, repoz)
    # Skip stray files (e.g. an index page) instead of crashing in listdir.
    if not os.path.isdir(article_dir):
        continue
    indef = os.listdir(article_dir)
    for ind in indef:
        # Binary mode hands the raw bytes to BeautifulSoup so it can detect
        # the page encoding itself; the with-block closes every handle.
        with open(os.path.join(article_dir, ind), 'rb') as opso:
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            souprep = bs4.BeautifulSoup(opso, 'html.parser')
        # Only paragraphs carrying the "intro" class are wanted, not every <p>.
        for link in souprep.find_all('p', class_="intro"):
            print(link)
            firlpar.append(link.text)
In [94]:
# Evaluates the bound method without calling it, so the tag is not modified.
# NOTE(review): if unwrapping was intended this needs to be link.unwrap() -- confirm.
link.unwrap
Out[94]:
In [ ]:
#a = BeautifulSoup.BeautifulSoup("<html><body><script>aaa</script></body></html>")
#[x.extract() for x in a.findAll('"intro"')]
In [81]:
# Print every collected intro text with the replace() below applied.
# print(...) with a single argument behaves the same on Python 2 and 3.
for firl in firlpar:
    print(firl.replace(' ', ''))
In [41]:
# Pull the raw text out of a Word document with mammoth.
# A .docx file is a zip archive, so it must be opened in binary mode;
# text mode corrupts or fails to decode the bytes on Python 3 / Windows.
with open("/home/wcmckee/Downloads/test.docx", "rb") as docx_file:
    result = mammoth.extract_raw_text(docx_file)
    text = result.value  # The raw text
    messages = result.messages  # Any messages
In [42]:
import bs4
In [43]:
# Parse an HTML string into a soup. The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
# NOTE(review): `html` is not assigned anywhere in the visible cells -- it
# presumably came from a since-removed cell (e.g. a mammoth HTML
# conversion of the .docx); confirm before running this top to bottom.
soudocx = bs4.BeautifulSoup(html, 'html.parser')
In [44]:
# Every <p> element of the parsed document except the first one.
# find_all is the current bs4 name (findAll is the legacy alias) and
# matches the call used by the scraping loop earlier in this file.
soupnop = soudocx.find_all('p')[1:]
In [20]:
In [21]:
# Print the text content of each selected paragraph.
# print(...) with a single argument behaves the same on Python 2 and 3.
for sonp in soupnop:
    print(sonp.text)
In [ ]: