CSV parsing


In [1]:
from helpers import get_test_doc

geodoc = get_test_doc()


Number of rows: 266 (Number with a lat/long: 241)
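
get_test_doc lives in the local helpers module and evidently prints those counts itself. A minimal sketch of what it might do, assuming the data is a CSV export read into a list of dicts keyed by column header (the filename here is hypothetical):

import csv

def get_test_doc(path='meetings.csv'):  # hypothetical filename
    """Read the CSV export into a list of row dicts keyed by column header."""
    with open(path, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))
    located = [r for r in rows
               if r['geolocation:latitude'] and r['geolocation:longitude']]
    print("Number of rows: {0} (Number with a lat/long: {1})".format(
        len(rows), len(located)))
    return rows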

Let's see what the last row looks like:


In [2]:
geodoc[-1]


Out[2]:
{'Dublin Core:Date': '31/07/1841',
 'Item Type Metadata:Date of meeting': '',
 'Item Type Metadata:Newspaper': 'Northern Star',
 'Item Type Metadata:NewspaperText': 'LIVERPOOL. - Mr. B. McCartney will lecture in the Association Room, Preston-Street, on Wednesday evening next.',
 'Item Type Metadata:Place': 'Liverpool',
 'geolocation:address': '',
 'geolocation:latitude': '53.4054719',
 'geolocation:longitude': '-2.9805393',
 'geolocation:map_type': '',
 'geolocation:zoom_level': '7',
 'itemType': 'Meeting'}
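
Latitude and longitude come back from the CSV as strings, so they need converting before any mapping or distance work. A quick sketch, assuming an empty string means the row has no geolocation:

In [ ]:
row = geodoc[-1]
if row['geolocation:latitude'] and row['geolocation:longitude']:
    point = (row['Item Type Metadata:Place'],
             float(row['geolocation:latitude']),
             float(row['geolocation:longitude']))
    print(point)  # ('Liverpool', 53.4054719, -2.9805393)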

In [3]:
# Having to pull the Stanford parser from local, as the BL web proxy blocks it. Sods.
import os
import nltk

from helpers import get_test_doc

geodoc = get_test_doc()

# Naive sentence split on full stops - this will also split on abbreviations
# like "Mr.", which are common in this corpus.
sentences = geodoc[102]['Item Type Metadata:NewspaperText'].split(".")
print("Split into {0} sentences".format(len(sentences)))

# Take the second sentence and tokenise it into words:
tokens = nltk.word_tokenize(sentences[1])
print(tokens)

# Part-of-speech tag the tokens:
tagged = nltk.pos_tag(tokens)
print(tagged)


Number of rows: 266 (Number with a lat/long: 241)
Split into 8 sentences
---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
<ipython-input-3-69ccc718ac87> in <module>()
     11 
     12 # Take the second sentence:
---> 13 tokens = nltk.word_tokenize(sentences[1])
     14 print(tokens)
     15 

C:\Python34\lib\site-packages\nltk\tokenize\__init__.py in word_tokenize(text, language)
     99     :param language: the model name in the Punkt corpus
    100     """
--> 101     return [token for sent in sent_tokenize(text, language)
    102             for token in _treebank_word_tokenize(sent)]
    103 

C:\Python34\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
     83     :param language: the model name in the Punkt corpus
     84     """
---> 85     tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
     86     return tokenizer.tokenize(text)
     87 

C:\Python34\lib\site-packages\nltk\data.py in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
    779 
    780     # Load the resource.
--> 781     opened_resource = _open(resource_url)
    782 
    783     if format == 'raw':

C:\Python34\lib\site-packages\nltk\data.py in _open(resource_url)
    893 
    894     if protocol is None or protocol.lower() == 'nltk':
--> 895         return find(path_, path + ['']).open()
    896     elif protocol.lower() == 'file':
    897         # urllib might not use mode='rb', so handle this one ourselves:

C:\Python34\lib\site-packages\nltk\data.py in find(resource_name, paths)
    622     sep = '*'*70
    623     resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
--> 624     raise LookupError(resource_not_found)
    625 
    626 def retrieve(resource_url, filename=None, verbose=True):

LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/english.pickle' not found.  Please
  use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - 'C:\\Users\\Katrina/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Python34\\nltk_data'
    - 'C:\\Python34\\lib\\nltk_data'
    - 'C:\\Users\\Katrina\\AppData\\Roaming\\nltk_data'
    - ''
**********************************************************************
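
word_tokenize calls sent_tokenize internally, and that needs the Punkt models, which NLTK doesn't ship by default. The fix is a one-off download (assuming the machine can reach the NLTK download server, or that the data has been copied into one of the nltk_data directories listed above):

In [ ]:
import nltk

# One-off fetch of the Punkt sentence tokenizer models.
nltk.download('punkt')

Once punkt is in place, nltk.sent_tokenize is also a better way to split the text than the naive split(".") above, since it copes with abbreviations like "Mr.":

In [ ]:
text = geodoc[102]['Item Type Metadata:NewspaperText']
sentences = nltk.sent_tokenize(text)
print("Split into {0} sentences".format(len(sentences)))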

For reference, here's the row we were trying to parse:


In [ ]:
geodoc[102]
