Parse the Reserve Bank of New Zealand news feed, write each item out as a two-file blog post (.meta and .rst), and run a small NLTK tag analysis on the scraped paragraphs.
In [1]:
import requests
import xmltodict
from bs4 import BeautifulSoup
import getpass
import nltk
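word_tokenize and pos_tag further down need their NLTK model data installed once; if those calls fail with a LookupError, a one-off download fixes it (resource names as in NLTK 3.x; adjust for your version):
In [ ]:
# one-off model downloads for the tokenizer and tagger used below
nltk.download('punkt')                       # model for word_tokenize
nltk.download('averaged_perceptron_tagger')  # model for pos_tag
nltk.download('universal_tagset')            # mapping used by tagset='universal'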
In [2]:
myusr = getpass.getuser()  # local username, used to build the output paths below
In [3]:
myreq = requests.get('http://www.rbnz.govt.nz/feeds/news')  # fetch the RBNZ news RSS feed
In [4]:
xlpars = xmltodict.parse(myreq.text)  # parse the feed XML into nested dicts
In [5]:
xlitem = xlpars['rss']['channel']['item']  # the list of <item> entries in the feed
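Each item is a dict keyed by its RSS child tags; a quick look at the first one confirms the fields used below (title, link, pubDate and description, assuming a standard RSS 2.0 feed):
In [ ]:
# inspect the fields of the first feed item
print(list(xlitem[0].keys()))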
In [6]:
blogtxt = []  # paragraph text collected from each article, for the NLTK steps below
In [8]:
for xli in xlitem:
    tit = xli['title']
    titslug = tit.replace(' ', '-')
    print(titslug)
    print(xli['link'])
    myxl = requests.get(xli['link'])  # fetch the full article page
    putime = xli['pubDate']
    # Nikola-style metadata file:
    # .. title: / .. slug: / .. date: / .. tags: / .. link: / .. description: / .. type: text
    with open('/home/{}/artctrl/posts/{}.meta'.format(myusr, titslug[0:15]), 'w') as rbn:
        rbn.write('.. title: {}\n.. slug: {}\n.. date: {}\n'.format(titslug, titslug, putime))
        rbn.write('.. description: {}\n'.format(xli['description']))
    # pull the first two paragraphs out of the article page
    soup = BeautifulSoup(myxl.text, 'html.parser')
    finpo = soup.find_all('p')
    firstpars = '\n\n'.join(p.text for p in finpo[:2])
    # post body: slug, feed description, then the opening paragraphs
    with open('/home/{}/artctrl/posts/{}.rst'.format(myusr, titslug[0:15]), 'w') as rbn:
        rbn.write('{}\n\n{}\n\n{}'.format(titslug, xli['description'], firstpars))
    for finp in finpo[:2]:
        print(finp.text)
        blogtxt.append(finp.text)
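A quick sanity check: read back the first generated .meta file (same path layout as above) to confirm the header format came out right:
In [ ]:
import glob
# print one generated meta file to verify the .. title / .. slug / .. date headers
metafiles = sorted(glob.glob('/home/{}/artctrl/posts/*.meta'.format(myusr)))
with open(metafiles[0]) as chk:
    print(chk.read())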
In [ ]:
mybltx = ' '.join(blogtxt)  # all collected paragraphs as one string
In [ ]:
nltool = nltk.word_tokenize(mybltx)  # tokenize the collected text
In [ ]:
tagged = nltk.pos_tag(nltool, tagset='universal')  # coarse universal tags (NOUN, VERB, ...), matching the check below
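With tagset='universal', each pair is (token, coarse tag); the first few pairs show the shape of the data (the actual tokens depend on what the feed contained):
In [ ]:
# peek at the first few (token, tag) pairs
print(tagged[:10])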
In [1]:
word_tag_pairs = nltk.bigrams(tagged)
# the tag of the word immediately preceding each noun in the blog text
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
# tags ranked by how often they precede a noun
[tag for (tag, _) in fdist.most_common()]
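FreqDist can also report the counts alongside the ranking:
In [ ]:
# most common noun-preceding tags, with counts
print(fdist.most_common(10))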