Parse the Reserve Bank of New Zealand news feed into blog posts
In [1]:
    
import requests                 # fetch the RSS feed and the article pages
import xmltodict                # parse the feed XML into nested dicts
from bs4 import BeautifulSoup   # pull paragraphs out of the article HTML
import getpass                  # current username, for building output paths
import nltk                     # tokenising and tagging the scraped text
    
In [2]:
    
myusr = getpass.getuser()  # used to build the /home/<user>/... output paths
    
In [3]:
    
myreq = requests.get('http://www.rbnz.govt.nz/feeds/news')
    
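The response status isn't checked anywhere; a minimal guard using requests' built-in check (optional, but it fails loudly if the feed is down):

In [ ]:
    
# Raise an HTTPError immediately if the feed could not be fetched.
myreq.raise_for_status()
    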
In [4]:
    
xlpars = xmltodict.parse(myreq.text)
    
In [5]:
    
xlitem = xlpars['rss']['channel']['item']  # the feed's <item> entries, one per post
    
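xmltodict maps each RSS `<item>` to a plain dict. The keys used below (`title`, `link`, `pubDate`, `description`) are standard RSS 2.0 fields; a quick sanity check on the first item (assuming the RBNZ feed follows that layout):

In [ ]:
    
# Expected to include 'description', 'link', 'pubDate' and 'title';
# if any of these is missing, the loop below will raise a KeyError.
sorted(xlitem[0].keys())
    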
In [6]:
    
blogtxt = list()  # collects paragraph text across all posts
    
In [8]:
    
for xli in xlitem:
    tit = xli['title']
    titslug = tit.replace(' ', '-')
    print(titslug)
    print(xli['link'])
    myxl = requests.get(xli['link'])
    putime = xli['pubDate']

    # Write a Nikola-style .meta file for the post, e.g.:
    # .. title: wer
    # .. slug: wer
    # .. date: 2017-07-30 00:56:50 UTC+12:00
    with open('/home/{}/artctrl/posts/{}.meta'.format(myusr, titslug[0:15]), 'w') as rbn:
        rbn.write('.. title: {}\n.. slug: {}\n.. date: {}\n'.format(titslug, titslug, putime))
        rbn.write(xli['description'])

    # Pull the first two paragraphs out of the article page.
    soup = BeautifulSoup(myxl.text, 'html.parser')
    finpo = soup.find_all('p')
    finp = [p.text for p in finpo[:2]]

    # Write the post body: slug, feed description, then the opening paragraphs.
    with open('/home/{}/artctrl/posts/{}.rst'.format(myusr, titslug[0:15]), 'w') as rbn:
        rbn.write('{}\n\n{}\n\n{}'.format(titslug, xli['description'], '\n\n'.join(finp)))

    for par in finpo[:2]:
        print(par.text)
        blogtxt.append(par.text)
    
    
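One caveat: `titslug[0:15]` truncates filenames to the first fifteen slug characters, so two posts whose titles share a prefix would overwrite each other. A sketch of a collision-safer basename built from the `pubDate` (a hypothetical helper, not part of the original run):

In [ ]:
    
from email.utils import parsedate_to_datetime

def post_basename(title, pubdate):
    # pubDate in RSS feeds is RFC 2822, e.g. 'Sun, 30 Jul 2017 00:56:50 +1200'.
    day = parsedate_to_datetime(pubdate).date()
    slug = title.lower().replace(' ', '-')
    return '{}-{}'.format(day.isoformat(), slug)
    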
In [ ]:
    
mybltx = ' '.join(blogtxt)
    
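`word_tokenize` and `pos_tag` below rely on NLTK data packages; if they haven't been fetched yet, a one-off download does it:

In [ ]:
    
nltk.download('punkt')                       # tokeniser models
nltk.download('averaged_perceptron_tagger')  # default POS tagger
    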
In [ ]:
    
nltool = nltk.word_tokenize(mybltx)
    
In [ ]:
    
tagged = nltk.pos_tag(nltool)
    
In [1]:
    
# Which parts of speech most often precede a noun in the blog text?
# (nltk.pos_tag uses Penn Treebank tags, so nouns start with 'NN'.)
word_tag_pairs = nltk.bigrams(tagged)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1].startswith('NN')]
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]
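
As a quick follow-up, the overall tag distribution of the scraped text, using the same `FreqDist` machinery:

In [ ]:
    
# Ten most frequent POS tags across the whole blog sample.
nltk.FreqDist(tag for (word, tag) in tagged).most_common(10)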