Ministry of Education News
Script to parse news pages from a locally saved copy of education.govt.nz
In [26]:
    
import bs4
import requests
import os
import mammoth
    
In [26]:
    
    
In [27]:
    
# Local directory holding the education.govt.nz news pages to parse.
opnewsfo = '/media/removable/lemonyellow/educ/www.education.govt.nz/news'
    
In [28]:
    
# os.listdir() returns entries in arbitrary, filesystem-dependent order;
# sort so the listing (and everything derived from it) is reproducible.
osliz = sorted(os.listdir(opnewsfo))
    
In [29]:
    
# Show the directory listing as the cell output.
osliz
    
    Out[29]:
In [30]:
    
# list.remove() mutates in place and returns None, so assigning its result
# left osrem as None; it also raised ValueError when the cell was re-run.
# Filter instead: osliz still ends up without 'index.html' (the loop over
# osliz calls os.listdir() on every entry, which fails for a plain file),
# and osrem now holds the filtered listing.
osliz = [entry for entry in osliz if entry != 'index.html']
osrem = osliz
    
In [31]:
    
# Show the value bound to osrem as the cell output.
osrem
    
In [61]:
    
# Walk every sub-directory of opnewsfo and print the intro paragraph(s)
# (<p class="intro">) found in each file it contains.
for repoz in osliz:
    repo_dir = os.path.join(opnewsfo, repoz)
    # Plain files at the top level (e.g. index.html) cannot be listed.
    if not os.path.isdir(repo_dir):
        continue
    indef = os.listdir(repo_dir)
    for ind in indef:
        page_path = os.path.join(repo_dir, ind)
        # Nested directories cannot be opened as pages; skip them.
        if not os.path.isfile(page_path):
            continue
        # The with-block closes each file; previously one handle leaked per page.
        with open(page_path, 'r') as opso:
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed on this machine.
            souprep = bs4.BeautifulSoup(opso, 'html.parser')

        # Only paragraphs carrying the "intro" class are of interest.
        for link in souprep.find_all('p', class_="intro"):
            # Parenthesised form is valid on both Python 2 and Python 3.
            print(link)
    
    
In [41]:
    
# A .docx file is a zip archive, so it must be opened in binary mode ("rb");
# text mode can corrupt the bytes or fail while decoding them.
with open("/home/wcmckee/Downloads/test.docx", "rb") as docx_file:
    result = mammoth.extract_raw_text(docx_file)

# The result object is fully populated once extraction returns, so the file
# no longer needs to be open here.
text = result.value  # The raw text
messages = result.messages  # Any messages
    
In [42]:
    
import bs4
    
In [43]:
    
# `html` was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Build it from the document with mammoth.
# NOTE(review): assumes the HTML was meant to come from the same test.docx
# that is read for raw text - confirm.
with open("/home/wcmckee/Downloads/test.docx", "rb") as docx_file:
    html = mammoth.convert_to_html(docx_file).value

# Name the parser explicitly so results do not depend on which optional
# parsers happen to be installed on this machine.
soudocx = bs4.BeautifulSoup(html, 'html.parser')
    
In [44]:
    
# find_all is the current name of the legacy findAll alias and matches the
# call used in the news-parsing loop; [1:] drops the first paragraph.
soupnop = soudocx.find_all('p')[1:]
    
In [20]:
    
    
In [21]:
    
# Print the text of each paragraph kept in soupnop. The parenthesised form
# runs on both Python 2 and Python 3; the bare print statement is Python 2 only.
for sonp in soupnop:
    print(sonp.text)
    
    
In [ ]: