It seems we can get some register of members' interests data from http://www.membersinterests.org.uk/.
In [8]:
url='http://downloads.membersinterests.org.uk/register/170707.zip'
!mkdir -p tmp/
!mkdir -p data/
!wget {url} -O tmp/temp.zip; unzip tmp/temp.zip -d data/ ; rm tmp/temp.zip
In [13]:
# Preview the first three raw lines of data/170707.csv from the shell
!head -n 3 data/170707.csv
In [15]:
# Load the register CSV into a DataFrame. The file is read with header=None,
# so the five columns are labelled by hand afterwards.
import pandas as pd

register_columns = ['Name', 'Constituency', 'Party', 'URL', 'Item']
df = pd.read_csv('data/170707.csv', header=None)
df.columns = register_columns
df.head()
Out[15]:
In [16]:
# One-off setup (uncomment to install spaCy and download its English model):
#!pip3 install spacy
#!python3 -m spacy download en
# NOTE(review): `spacy.en` looks like an older spaCy module layout - confirm it matches the installed version
from spacy.en import English
# Shared parser instance; entities() calls it on each text it is given
parser = English()
In [17]:
def entities(example, show=False):
    """Print the named entities found in `example`, grouped by entity text.

    Parameters
    ----------
    example : str
        Text to run through the module-level `parser`.
    show : bool, optional
        When true, echo `example` before the entity report.

    Prints a banner line followed by a dict that maps each entity's text
    (its tokens' `orth_` values joined by single spaces) to a list of
    ``(label, label_)`` tuples, one tuple per occurrence of that text.
    Returns None.
    """
    if show:
        print(example)
    parsedEx = parser(example)
    print("-------------- entities only ---------------")
    # The parsed document exposes its entities through the `ents` property.
    tags = {}
    for entity in parsedEx.ents:
        term = ' '.join(t.orth_ for t in entity)
        # Group on the term itself so that repeated mentions accumulate
        # their labels instead of replacing the earlier entry.
        tags.setdefault(term, []).append((entity.label, entity.label_))
    print(tags)
In [117]:
#Get a single register line item to play with
txt=df.iloc[0]['Item']
txt
Out[117]:
In [25]:
# Echo the sample text, then print the entities found in it
entities(txt, True)
We might then try to reconcile things classed as an ORG using something like the OpenCorporates reconciliation API.
In [116]:
# Try reconciling an organisation name against the OpenCorporates reconcile endpoint
import requests
ocrecURL='http://opencorporates.com/reconcile/gb'
# Bound the wait (same 80 s limit as the calais() request) so a stalled
# connection cannot hang the notebook indefinitely.
rq=requests.get(ocrecURL,params={'query':'Guardian News & Media'},timeout=80)
rq.json()
Out[116]:
In [22]:
import os
# Calais API token. Taken from the CALAIS_KEY environment variable when it is set,
# so the secret does not have to be saved in the notebook; otherwise "" as before.
CALAIS_KEY=os.environ.get("CALAIS_KEY", "")
In [118]:
import requests
import json
def calais(text, calaisKey=CALAIS_KEY):
    """Submit `text` to the Calais endpoint and return the decoded JSON reply.

    Parameters
    ----------
    text : str
        Content to tag; passed as the 'file' entry of the request's `files`.
    calaisKey : str, optional
        Value sent in the 'X-AG-Access-Token' header. Defaults to the value
        CALAIS_KEY had when this function was defined.

    Returns
    -------
    The result of calling ``.json()`` on the response.
    """
    endpoint = 'https://api.thomsonreuters.com/permid/calais'
    request_headers = {
        'X-AG-Access-Token': calaisKey,
        'Content-Type': 'text/raw',
        'outputformat': 'application/json',
    }
    reply = requests.post(
        endpoint,
        files={'file': text},
        headers=request_headers,
        timeout=80,
    )
    return reply.json()
In [119]:
def cleaner(txt):
    """Return `txt` with '. ' inserted before every occurrence of 'Address of'.

    Parameters
    ----------
    txt : str
        Register entry text.

    Returns
    -------
    str
        The text with each 'Address of' replaced by '. Address of'.
    """
    marker = 'Address of'
    return txt.replace(marker, '. ' + marker)
In [120]:
# Run the cleaned sample entry through Calais and keep the response for inspection
oc=calais( cleaner(txt) )
In [121]:
def ocQuickView(oc):
    """Condense a Calais response into per-group lists of compact records.

    Parameters
    ----------
    oc : dict
        Mapping whose values are checked for a '_typeGroup' entry
        (the notebook passes the result of `calais`).

    Returns
    -------
    dict
        Maps each '_typeGroup' encountered (only 'entities', 'relations',
        'socialTag' and 'topics' are kept) to a list of records. A record
        holds 'instances' (the 'exact' strings of the item's instances,
        when the item has instances) plus whichever of 'name', 'address'
        and '_type' the item carries.
    """
    wanted_groups = ['entities', 'relations', 'socialTag', 'topics']
    copied_fields = ['name', 'address', '_type']

    items = {}
    for node in oc.values():
        # Skip anything that is not in one of the groups of interest.
        if '_typeGroup' not in node or node['_typeGroup'] not in wanted_groups:
            continue

        record = {}
        if 'instances' in node:
            record['instances'] = [
                inst['exact'] for inst in node['instances'] if 'exact' in inst
            ]
        for field in copied_fields:
            if field in node:
                record[field] = node[field]

        items.setdefault(node['_typeGroup'], []).append(record)
    return items
# Show the condensed view of the Calais response held in `oc`
ocQuickView(oc)
Out[121]:
In [122]:
# Clean register row 155, tag it with Calais, and print the text above its condensed summary
ix = 155
txt = cleaner(df.iloc[ix]['Item'])
print(txt, ocQuickView(calais(txt)), sep='\n---\n')
In [123]:
# Clean register row 299, tag it with Calais, and print the text above its condensed summary
ix = 299
txt = cleaner(df.iloc[ix]['Item'])
print(txt, ocQuickView(calais(txt)), sep='\n---\n')
In [124]:
# Clean register row 863, tag it with Calais, and print the text above its condensed summary
ix = 863
txt = cleaner(df.iloc[ix]['Item'])
print(txt, ocQuickView(calais(txt)), sep='\n---\n')
In [97]:
# Hard-coded sample donation entry; the field-extraction regex is matched against it
txt="Name of donor: Nael FarargyAddress of donor: privateAmount of donation or nature and value of donation in kind: £20,000 to hire a part time member of staff and meet office and staff expensesDate received: 12 April 2017Date accepted: 12 April 2017Donor status: individual(Registered 18 April 2017) "
txt
Out[97]:
Define a regular expression to pull out the data in structured form if the text conforms to a conventional format.
In [105]:
# Pattern for the conventional donation entry layout: six labelled fields in a fixed order.
# Each label's colon and any whitespace after it are consumed outside the named group,
# so captured values do not start with ':' or a space. The colon after "Donor status" is
# optional so that text without it still matches. Trailing whitespace is not stripped.
extractor1 = (
    r'Name of donor:\s*(?P<name>.*)'
    r'Address of donor:\s*(?P<address>.*)'
    r'Amount of donation or nature and value of donation in kind:\s*(?P<amount>.*)'
    r'Date received:\s*(?P<rxd>.*)'
    r'Date accepted:\s*(?P<accptd>.*)'
    r'Donor status:?\s*(?P<status>.*)'
)
In [106]:
import re
# Compile the donation-entry pattern once so it can be reused across entries
r=re.compile(extractor1)
In [113]:
# Extract the named fields from the sample entry as a dict.
# Note: match() returns None when the text does not fit the pattern, so this line
# raises AttributeError for entries that do not follow the conventional layout.
r.match(txt).groupdict()
# If any values still carry stray leading or trailing whitespace, we could either absorb it in the regex or strip it in a separate cleaning pass
Out[113]:
We could also add in further parsing to try to identify the actual amount and the rationale for amount items, as well as further structuring the status field. Casting dates to datetimes would also make sense.
There may be other conventional forms in register entries, for which alternative regular expressions could be defined.
Having got structured data out, we could start to put it into a database and then make queries over it.
In [ ]: