In [1]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as sql
import matplotlib.pyplot as plt
import numpy
import math
import seaborn as sns
import nltk
import pyspark.ml.feature as feature
In [2]:
# Load Processed Parquet
sqlContext = SQLContext(sc)
notes = sqlContext.read.parquet("../data/idigbio_notes.parquet")
total_records = notes.count()
print(total_records)
# Small sample of the df
notes = notes.sample(withReplacement=False, fraction=0.001)
notes.cache()
print(notes.count())
In [3]:
for r in notes.head(20):
    print(r['document'] + "\n")
In [4]:
notes_pdf = notes.toPandas()
In [5]:
def tokenize(s):
    '''
    Take a string and return a list of tokens split out from it
    with the nltk library.
    '''
    if s is not None:
        return nltk.tokenize.word_tokenize(s)
    else:
        return []

notes_pdf['tokens'] = notes_pdf['document'].map(tokenize)
In [6]:
print(notes_pdf.head()['tokens'])
In [7]:
def part_of_speech(t):
    '''
    With a list of tokens, mark each token's part of speech and
    return a list of (token, tag) tuples.
    '''
    return nltk.pos_tag(t)

notes_pdf['pos'] = notes_pdf['tokens'].map(part_of_speech)
In [8]:
print(notes_pdf.head()['pos'])
In [9]:
def chunk(p):
    '''
    Group a list of (token, tag) tuples into named-entity chunks
    with nltk's ne_chunk.
    '''
    return nltk.chunk.ne_chunk(p)

notes_pdf['chunks'] = notes_pdf['pos'].map(chunk)
In [10]:
print(notes_pdf.head()['chunks'])
Now, with some chunks, can we find any that match terms from the Darwin Core text? Maybe use word2vec on them. Dude, this is a Hard Problem. We probably need an ontology lookup service, e.g.: http://www.ebi.ac.uk/ols/beta/search?q=puma&groupField=iri&start=0&ontology=envo
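As a rough sketch of what that lookup could look like, here is a minimal query against the OLS search endpoint above using requests. The query parameters come straight from the example URL; the JSON layout ("response" -> "docs") is an assumption about the beta API, not something verified here.

In [ ]:
import requests

def ols_search(term, ontology="envo"):
    '''
    Minimal sketch: search the EBI Ontology Lookup Service for a term.
    Query parameters mirror the example URL above; the response layout
    ("response" -> "docs") is assumed, not verified.
    '''
    resp = requests.get("http://www.ebi.ac.uk/ols/beta/search",
                        params={"q": term, "groupField": "iri",
                                "start": 0, "ontology": ontology})
    resp.raise_for_status()
    return resp.json().get("response", {}).get("docs", [])

# e.g. ols_search("puma") should list candidate ENVO matches for "puma"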
In [11]:
# https://github.com/alvations/pywsd
# This uses its own term definitions
from pywsd.similarity import max_similarity
s = """locality The specific description of the place. Less specific geographic information can be
provided in other geographic terms (higherGeography, continent, country, stateProvince, county,
municipality, waterBody, island, islandGroup). This term may
contain information modified from the original to correct perceived errors or standardize the description."""
In [12]:
print(max_similarity(s, 'town', 'lin'))
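Here max_similarity treats s as the context sentence and disambiguates 'town' against it, scoring candidate WordNet senses with the Lin similarity measure ('lin'); the best-scoring sense is what gets printed.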
In [18]:
def find_triples(s):
    '''
    Find s-v-p triples in a tagged list of tokens, returns a
    list of dicts with the found triples.
    '''
    triples = []
    t = {}
    for node in s:
        # ne_chunk() output mixes (token, tag) leaves with named-entity
        # subtrees, which would break tuple unpacking; skip subtrees
        if isinstance(node, nltk.Tree):
            continue
        (token, tag) = node
        if tag.startswith("NN"):
            t["subject"] = token
        #else:
        #    triples.append(t)
        #    t = {}
    return triples
for s in notes_pdf.head(1)['chunks']:
    print(s)
    print(find_triples(s))
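So far find_triples only ever records a subject, so it always returns an empty list. As a hypothetical sketch of where it could go next, here is one crude way to emit (subject, verb, object) triples from the flat (token, tag) pairs in the pos column; the tag patterns and the name find_triples_sketch are illustrative, not a worked-out method.

In [ ]:
def find_triples_sketch(tagged):
    '''
    Hypothetical sketch: walk flat (token, tag) pairs and emit a
    {"subject", "verb", "object"} dict whenever a noun-verb-noun
    pattern completes. Crude, but enough to eyeball candidates.
    '''
    triples = []
    t = {}
    for (token, tag) in tagged:
        if tag.startswith("NN"):
            if "verb" in t:
                t["object"] = token
                triples.append(t)
                t = {}
            else:
                t["subject"] = token
        elif tag.startswith("VB") and "subject" in t:
            t["verb"] = token
    return triples

print(find_triples_sketch(notes_pdf.head(1)['pos'].iloc[0]))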