Identifying relationships in notes

Goal of this notebook is to determine how successful we can find relationships


In [1]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [18]:
# Load Processed Parquet
sqlContext = SQLContext(sc)
notes = sqlContext.read.parquet("../data/idigbio_notes.parquet")
total_records = notes.count()
print(total_records)
# Small sample of the df
notes = notes.sample(withReplacement=False, fraction=0.1)
notes.cache()
print(notes.count())


3230857
323356

In [19]:
# Still have some problems with the document field having nulls and
# being empty. Not sure where nulls came from but likely the 
# empties are really whitespace
#notes = notes.select(sql.trim(notes["document"]).alias("document"))\
#    .dropna(subset="document")\
#    .filter(sql.length("document") > 0)


#print(notes.count())

# Moved the above to the text_characterization notebook and re-wrote parquet
# should be clean now.

notes.select(notes["document"])\
    .orderBy(notes["document"])\
    .show(10, truncate=False)
notes.select(notes["document"])\
    .orderBy(notes["document"], ascending=False)\
    .show(10, truncate=False)


+--------+
|document|
+--------+
|!       |
|!       |
|!       |
|!       |
|!       |
|!       |
|!       |
|!       |
|!       |
|!       |
+--------+
only showing top 10 rows

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|♂                                                                                                                                                                                                                                                        |
|♀; On same sheet as DS 136                                                                                                                                                                                                                               |
|♀ [BARRO COLORADO-C.Z. Lights:/SM-L. 30 IV 1974 Coll. H. Wolda]                                                                                                                                                                                          |
|♀                                                                                                                                                                                                                                                        |
|♀                                                                                                                                                                                                                                                        |
|♀                                                                                                                                                                                                                                                        |
|“Notes presumably authored by the collector/determiner and accompanying specimen(s) associated with this locality refer to “Sisibarat.” The meaning of this term is not clear but may provide additional information concerning the collecting locality.”|
|árbol de 6 m de alto;flor blanca con amarillo                                                                                                                                                                                                            |
|Árvore.                                                                                                                                                                                                                                                  |
|Árvore, flor alvescente.                                                                                                                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 10 rows

Sentence detection

Does splitting in to sentences matter? Is there enough information to do this with a natural language library or should things like "," "[]", and "{}" be worked in to address semi-structured data?

Tokenize documents


In [4]:
from lib.tokens import Tokens
tokens = Tokens()
t = tokens.tokenize("Hello, my name is Mace Windoo")
print(t)


['Hello', ',', 'my', 'name', 'is', 'Mace', 'Windoo']

In [5]:
udf_tokenize = sql.udf(tokens.tokenize, types.ArrayType(types.StringType()))

notes_w_tokens = notes.withColumn('tokens', udf_tokenize(notes['document']))
for r in notes_w_tokens.select(notes_w_tokens["tokens"]).head(50):
    print(" | ".join(r["tokens"]))
    print("\n")


returned | specimens | identified | by | RES


[ | 01°44'28 | '' | S | 51°27'21.3 | '' | W | BRASIL | : | PA | , | Melgaço | , | Caxiuanã | Estação | Cient. | Ferreira | Pena | Trilha | , | 19-22.xi.2003 | YPT | APAguiar | & | JDias | P05091 | ]


Plot | : | S18. | Photo | : | yes | .


5 | of | 81 | specs. | to | USNM | , | 86-3126 | ; | 5 | of | 81 | specs. | to | Kochi | U. | , | 86-3127 | ; | alcohol | specs. | in | 2 | jars. | Cruise | 18B | , | sta. | 770 | ; | same | as | sta. | LK | 66-120 | .


Frequent. | In | sandy | soil | on | ridge | top | in | rough | breaks | with | Cercocarpus | montanus | , | Parthenium | alpinum | and | Musineon | .


[ | 23°38'56.9 | '' | S | 45°53'48.8 | '' | W | , | BRASIL | : | SP | , | Salesópolis | , | Res. | , | Biol. | Boracéia | , | Trilha | dos | Pilões | , | P2.2 | , | margem | do | rio | , | 12-15.i.2003 | , | YPT | , | 863m | , | ACC | Macedo | & | JS | Freitas | ]


Substrate | : | intact | coarse | down | wood | > | 5 | centimeter | diameter | , | decay | class | 1-2. | Growth | Vigor | : | non-reproductive | and | vigorous | .


Small | shrubby | tree | along | road | .


along | permanent | stream


On | moist | and | diffusely | lit | boulder | .


BMNH | ( | E | ) | 1016108 | ; | Arnold | Coll. | B.M.1934-354


Measurements | : | 191-85-22-0


Sandy | flood | plain | of | the | Platte | River | .


locality | info | appears | to | be | incorrect | ... | ( | on | label | ) | N | Mexico | or | should | it | be | National | Monument | ... | db


Shrub | 1 | m. | Corolla | translucent | white


down | wood | ; | Tiffany | field | notebook


Number | in | Set | : | 1 | ; | A | common | erect | perennial | herb | ; | flowers | brackish | .


original | data | lost


Herb. | Brasil. | Regnell. | Musei | bot. | Stockholm.\r\n\r\nExped. | Imae | Regnellian. | Lichenes


Rep. | 1-B | ; | genitalia | slide | 178 | [ | FLORIDA | : | Leon | Co. | , | Tall | TimberRes. | Sta. | ; | added | 12-JUL-2002 | ] | [ | 24 | IX | 69 | ; | added | 12-JUL-2002 | ]


under | pine | slab | at | abandoned | sawdust | pile


Field | notes | on | file | .


Handwritten | label | indicates | that | specimen | was | part | of | the | Cross | , | Dawson | , | or | Potts | collections | ( | fide | David | Bettman | ) | . | Label | states | 'Ex | larva | '


Additional | data | on | card


Aquatic | with | purple | petals | .


Additional | data | on | card | .


Also | a | paratype | of | Helminthosporium | glabroides | F. | Stevens | .


Plot | C-17 | .


Frequent


Standard | OIl | Spill | Survey | 1971


compost | pile


Plants | of | Franklin | District | Northwest | Territories | , | Canada


[ | ITALY | Lomb. | L. | Sartirana | Br. | 20.VII.54 | Brivio | ]


elfin | bamboo/mixed | subparamo | litter | berlese | forest | litter


hepialid | webs | on | trees


`` | Fresh | . | ''


flight | intercept | trap


Tree | to | 3 | m | ; | flowers | cream | ; | occasional | .


Rare


Purchased | by | Columbia | College | , | 1885-7


[ | This | collection | is | stored | in | the | indeterminate | Volvariaceae | box | . | ]


Originally | cataloged | as | a | lot | , | see | SU | 2102 | .


[ | 39°59.936'N | 83°02.809'W | USA | : | Ohio | , | Franklin | Co. | Columbus | @ | Kinnear | Rd. | 7-14.ix.2005 | Malaise | , | NF | Johnson | coll | . | ]


savannah | forest | flight | intercept | trap


Stiff | shrub


From | CAS | 6528 | .


found | on | Koeberlinia | spinosa | Zucc. | ( | crown | of | thorns | ) | [ | USDA | ]


Source | : | MRS


TC | 715 | ( | Cekalovic | 's | lot | number | )


Wells | , | Whiting



In [6]:
notes_w_tokens.printSchema()


root
 |-- uuid: string (nullable = true)
 |-- occurrenceID: string (nullable = true)
 |-- catalogNumber: string (nullable = true)
 |-- county: string (nullable = true)
 |-- institutionCode: string (nullable = true)
 |-- country: string (nullable = true)
 |-- countryCode: string (nullable = true)
 |-- stateProvince: string (nullable = true)
 |-- family: string (nullable = true)
 |-- recordedBy: string (nullable = true)
 |-- order: string (nullable = true)
 |-- specificEpithet: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- scientificName: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- fieldNotes: string (nullable = true)
 |-- occurrenceRemarks: string (nullable = true)
 |-- eventRemarks: string (nullable = true)
 |-- document: string (nullable = true)
 |-- document_len: integer (nullable = true)
 |-- fieldNotes_len: integer (nullable = true)
 |-- eventRemarks_len: integer (nullable = true)
 |-- occurrenceRemarks_len: integer (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)


In [7]:
from lib.pos_tags import PosTags
pos_tags = PosTags()

pos = pos_tags.tag(t)
print(pos)
print(type(pos))
print(type(pos[0]))
print(type(pos[0]["tag"]))
print(pos[0])


[{'tag': 'NNP', 'word': 'Hello'}, {'tag': ',', 'word': ','}, {'tag': 'PRP$', 'word': 'my'}, {'tag': 'NN', 'word': 'name'}, {'tag': 'VBZ', 'word': 'is'}, {'tag': 'NNP', 'word': 'Mace'}, {'tag': 'NNP', 'word': 'Windoo'}]
<type 'list'>
<type 'dict'>
<type 'str'>
{'tag': 'NNP', 'word': 'Hello'}

In [8]:
udf_part_of_speech = sql.udf(pos_tags.tag, types.ArrayType(
                                    types.MapType(
                                        types.StringType(),
                                        types.StringType()
                                    )
                                )
                            )

notes_w_tokens2 = notes_w_tokens.withColumn('pos', 
                                            udf_part_of_speech(notes_w_tokens['tokens']))

for r in notes_w_tokens2.select(notes_w_tokens2["pos"]).head(50):
    s = ""
    for p in r["pos"]:
        s = s + p["word"] + " (" + p["tag"] + ") | "
    print(s + "\n")


returned (VBD) | specimens (NNS) | identified (VBD) | by (IN) | RES (NNP) | 

[ (NN) | 01°44'28 (CD) | '' ('') | S (NNP) | 51°27'21.3 (CD) | '' ('') | W (NNP) | BRASIL (NNP) | : (:) | PA (NNP) | , (,) | Melgaço (NNP) | , (,) | Caxiuanã (NNP) | Estação (NNP) | Cient. (NNP) | Ferreira (NNP) | Pena (NNP) | Trilha (NNP) | , (,) | 19-22.xi.2003 (CD) | YPT (NNP) | APAguiar (NNP) | & (CC) | JDias (NNP) | P05091 (NNP) | ] (NNP) | 

Plot (NN) | : (:) | S18. (NNP) | Photo (NNP) | : (:) | yes (NNS) | . (.) | 

5 (LS) | of (IN) | 81 (CD) | specs. (NNP) | to (TO) | USNM (NNP) | , (,) | 86-3126 (CD) | ; (:) | 5 (CD) | of (IN) | 81 (CD) | specs. (NNP) | to (TO) | Kochi (NNP) | U. (NNP) | , (,) | 86-3127 (CD) | ; (:) | alcohol (NN) | specs. (NNP) | in (IN) | 2 (CD) | jars. (NNP) | Cruise (NNP) | 18B (CD) | , (,) | sta. (NNP) | 770 (CD) | ; (:) | same (JJ) | as (IN) | sta. (NNP) | LK (NNP) | 66-120 (CD) | . (.) | 

Frequent. (NNP) | In (IN) | sandy (NN) | soil (NN) | on (IN) | ridge (NN) | top (NN) | in (IN) | rough (JJ) | breaks (NNS) | with (IN) | Cercocarpus (NNP) | montanus (NN) | , (,) | Parthenium (NNP) | alpinum (NN) | and (CC) | Musineon (NNP) | . (.) | 

[ (NN) | 23°38'56.9 (CD) | '' ('') | S (NNP) | 45°53'48.8 (CD) | '' ('') | W (NNP) | , (,) | BRASIL (NNP) | : (:) | SP (NNP) | , (,) | Salesópolis (NNP) | , (,) | Res. (NNP) | , (,) | Biol. (NNP) | Boracéia (NNP) | , (,) | Trilha (NNP) | dos (NNS) | Pilões (VBZ) | , (,) | P2.2 (-NONE-) | , (,) | margem (NN) | do (VB) | rio (NN) | , (,) | 12-15.i.2003 (CD) | , (,) | YPT (NNP) | , (,) | 863m (CD) | , (,) | ACC (NNP) | Macedo (NNP) | & (CC) | JS (NNP) | Freitas (NNP) | ] (NNP) | 

Substrate (NNP) | : (:) | intact (NN) | coarse (NN) | down (IN) | wood (NN) | > (:) | 5 (CD) | centimeter (NN) | diameter (NN) | , (,) | decay (NN) | class (NN) | 1-2. (CD) | Growth (NN) | Vigor (NNP) | : (:) | non-reproductive (JJ) | and (CC) | vigorous (JJ) | . (.) | 

Small (JJ) | shrubby (NN) | tree (NN) | along (IN) | road (NN) | . (.) | 

along (IN) | permanent (NN) | stream (NN) | 

On (IN) | moist (NN) | and (CC) | diffusely (RB) | lit (NN) | boulder (NN) | . (.) | 

BMNH (NNP) | ( (:) | E (NNP) | ) (:) | 1016108 (CD) | ; (:) | Arnold (NNP) | Coll. (NNP) | B.M.1934-354 (NNP) | 

Measurements (NNS) | : (:) | 191-85-22-0 (CD) | 

Sandy (NNP) | flood (VBD) | plain (NN) | of (IN) | the (DT) | Platte (NNP) | River (NNP) | . (.) | 

locality (NN) | info (VBD) | appears (NNS) | to (TO) | be (VB) | incorrect (JJ) | ... (:) | ( (:) | on (IN) | label (NN) | ) (:) | N (NNP) | Mexico (NNP) | or (CC) | should (MD) | it (PRP) | be (VB) | National (NNP) | Monument (NNP) | ... (:) | db (NN) | 

Shrub (NNP) | 1 (CD) | m. (NNP) | Corolla (NNP) | translucent (NN) | white (NN) | 

down (IN) | wood (NN) | ; (:) | Tiffany (NNP) | field (NN) | notebook (NN) | 

Number (NNP) | in (IN) | Set (NNP) | : (:) | 1 (LS) | ; (:) | A (DT) | common (JJ) | erect (NN) | perennial (JJ) | herb (NN) | ; (:) | flowers (NNS) | brackish (VBP) | . (.) | 

original (JJ) | data (NNS) | lost (VBD) | 

Herb. (NNP) | Brasil. (NNP) | Regnell. (NNP) | Musei (NNP) | bot. (NNP) | Stockholm.\r\n\r\nExped. (NNP) | Imae (NNP) | Regnellian. (NNP) | Lichenes (NNP) | 

Rep. (NNP) | 1-B (CD) | ; (:) | genitalia (NN) | slide (NN) | 178 (CD) | [ (CD) | FLORIDA (NNP) | : (:) | Leon (NNP) | Co. (NNP) | , (,) | Tall (NNP) | TimberRes. (NNP) | Sta. (NNP) | ; (:) | added (VBD) | 12-JUL-2002 (CD) | ] (CD) | [ (CD) | 24 (CD) | IX (CD) | 69 (CD) | ; (:) | added (VBD) | 12-JUL-2002 (CD) | ] (CD) | 

under (IN) | pine (NN) | slab (NN) | at (IN) | abandoned (VBN) | sawdust (RB) | pile (JJ) | 

Field (NNP) | notes (NNS) | on (IN) | file (JJ) | . (.) | 

Handwritten (NNP) | label (NN) | indicates (VBZ) | that (IN) | specimen (NNS) | was (VBD) | part (NN) | of (IN) | the (DT) | Cross (NNP) | , (,) | Dawson (NNP) | , (,) | or (CC) | Potts (NNP) | collections (NNS) | ( (:) | fide (NN) | David (NNP) | Bettman (NNP) | ) (NNP) | . (.) | Label (NNP) | states (VBZ) | 'Ex (JJ) | larva (NN) | ' ('') | 

Additional (JJ) | data (NNS) | on (IN) | card (NN) | 

Aquatic (JJ) | with (IN) | purple (JJ) | petals (NNS) | . (.) | 

Additional (JJ) | data (NNS) | on (IN) | card (NN) | . (.) | 

Also (RB) | a (DT) | paratype (NN) | of (IN) | Helminthosporium (NNP) | glabroides (VBZ) | F. (NNP) | Stevens (NNP) | . (.) | 

Plot (NN) | C-17 (-NONE-) | . (.) | 

Frequent (NNP) | 

Standard (NNP) | OIl (NNP) | Spill (NNP) | Survey (NNP) | 1971 (CD) | 

compost (NN) | pile (VBD) | 

Plants (NNS) | of (IN) | Franklin (NNP) | District (NNP) | Northwest (NNP) | Territories (NNP) | , (,) | Canada (NNP) | 

[ (NN) | ITALY (RB) | Lomb. (NNP) | L. (NNP) | Sartirana (NNP) | Br. (NNP) | 20.VII.54 (CD) | Brivio (NNP) | ] (NNP) | 

elfin (NN) | bamboo/mixed (VBN) | subparamo (NN) | litter (NN) | berlese (NN) | forest (NN) | litter (NN) | 

hepialid (VBD) | webs (NNS) | on (IN) | trees (NNS) | 

`` (``) | Fresh (JJ) | . (.) | '' ('') | 

flight (NN) | intercept (VBD) | trap (NN) | 

Tree (NNP) | to (TO) | 3 (CD) | m (NN) | ; (:) | flowers (NNS) | cream (VBP) | ; (:) | occasional (JJ) | . (.) | 

Rare (NNP) | 

Purchased (VBN) | by (IN) | Columbia (NNP) | College (NNP) | , (,) | 1885-7 (CD) | 

[ (NN) | This (DT) | collection (NN) | is (VBZ) | stored (VBN) | in (IN) | the (DT) | indeterminate (NN) | Volvariaceae (NNP) | box (NN) | . (.) | ] (NN) | 

Originally (RB) | cataloged (VBD) | as (IN) | a (DT) | lot (NN) | , (,) | see (VB) | SU (NNP) | 2102 (CD) | . (.) | 

[ (NN) | 39°59.936'N (CD) | 83°02.809'W (CD) | USA (NNP) | : (:) | Ohio (NNP) | , (,) | Franklin (NNP) | Co. (NNP) | Columbus (NNP) | @ (NNP) | Kinnear (NNP) | Rd. (NNP) | 7-14.ix.2005 (CD) | Malaise (NNP) | , (,) | NF (NNP) | Johnson (NNP) | coll (NN) | . (.) | ] (NN) | 

savannah (NN) | forest (VBD) | flight (NN) | intercept (IN) | trap (NN) | 

Stiff (NNP) | shrub (NN) | 

From (IN) | CAS (NNP) | 6528 (CD) | . (.) | 

found (VBD) | on (IN) | Koeberlinia (NNP) | spinosa (NN) | Zucc. (NNP) | ( (NNP) | crown (NN) | of (IN) | thorns (NNS) | ) (:) | [ (:) | USDA (NNP) | ] (:) | 

Source (NN) | : (:) | MRS (NNS) | 

TC (NNP) | 715 (CD) | ( (CD) | Cekalovic (NNP) | 's (POS) | lot (NN) | number (NN) | ) (:) | 

Wells (NNS) | , (,) | Whiting (NNP) | 


In [9]:
notes_w_tokens2.printSchema()


root
 |-- uuid: string (nullable = true)
 |-- occurrenceID: string (nullable = true)
 |-- catalogNumber: string (nullable = true)
 |-- county: string (nullable = true)
 |-- institutionCode: string (nullable = true)
 |-- country: string (nullable = true)
 |-- countryCode: string (nullable = true)
 |-- stateProvince: string (nullable = true)
 |-- family: string (nullable = true)
 |-- recordedBy: string (nullable = true)
 |-- order: string (nullable = true)
 |-- specificEpithet: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- scientificName: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- fieldNotes: string (nullable = true)
 |-- occurrenceRemarks: string (nullable = true)
 |-- eventRemarks: string (nullable = true)
 |-- document: string (nullable = true)
 |-- document_len: integer (nullable = true)
 |-- fieldNotes_len: integer (nullable = true)
 |-- eventRemarks_len: integer (nullable = true)
 |-- occurrenceRemarks_len: integer (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pos: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)


In [10]:
# Can we work with maps natively?
notes_w_tokens2.select(notes_w_tokens2["pos"][0]["word"]).show(3, truncate=False)
# YES!


+------------+
|pos[0][word]|
+------------+
|returned    |
|[           |
|Plot        |
+------------+
only showing top 3 rows


In [28]:
# Let's go pipline our whole analysis
from lib.tokens import Tokens
from lib.pos_tags import PosTags
from lib.relations import Relations

t = Tokens()
p = PosTags()
r = Relations()

def pipeline(s):
    '''
    Given a string, return a list of relations
    '''
    return r.find(p.tag(t.tokenize(s)))

pipeline_udf = sql.udf(pipeline, types.ArrayType(
                                       types.MapType(
                                               types.StringType(), 
                                               types.MapType(
                                                       types.StringType(),
                                                       types.StringType()
                                                       )
                                               )
                                       )
                    )

In [29]:
relations = notes.withColumn("rels", pipeline_udf(notes["document"]))
relations.cache()
relations.select(relations["rels"]).show(3, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|rels                                                                                                                                                                                                                                                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Map(p -> Map(word -> animals., tag -> NNP), s -> Map(word -> shells, tag -> NNS), v -> Map(word -> with, tag -> IN)), Map(p -> Map(word -> PD, tag -> NNP), s -> Map(word -> Vouchers, tag -> NNP), v -> Map(word -> of, tag -> IN)), Map(p -> Map(word -> Elimia, tag -> NNP), s -> Map(word -> Originally, tag -> NNP), v -> Map(word -> identified, tag -> VBD))]|
|[Map(p -> Map(word -> collector, tag -> NN), s -> Map(word -> Gift, tag -> NNP), v -> Map(word -> of, tag -> IN))]                                                                                                                                                                                                                                                   |
|[]                                                                                                                                                                                                                                                                                                                                                                   |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows


In [ ]:
# Maybe graph distribution of relations per record?

In [ ]:
# Let's at least count up how many relations we have

Leave this for later


In [11]:
# Split out words by type
# Can't figure out how to access elements of a map in a filter so 
# build something that filters the lists for us.
def find_pos(pos, part):
    '''
    Take a list of dicts that represent words tagged with
    pos information and return a list of words that match
    the requested pos
    '''
    retval = []
    for p in pos:
        if p["tag"].startswith(part):
            retval.append(p["word"])
    return retval

print(find_pos(pos, "NN"))


['Hello', 'name', 'Mace', 'Windoo']

In [12]:
# Can't figure out how to pass a single string to a UDF
find_nouns_udf = sql.udf(lambda x: find_pos(x, "NN"), types.ArrayType(types.StringType()))

In [13]:
nouns = notes_w_tokens2\
    .select(sql.explode(find_nouns_udf(notes_w_tokens2["pos"])).alias("word"))
nouns.cache()
nouns.show(3)


+---------+
|     word|
+---------+
|specimens|
|      RES|
|        [|
+---------+
only showing top 3 rows


In [14]:
noun_counts = nouns\
    .groupBy("word")\
    .count()\
    .orderBy("count", ascending=False)\
    
noun_counts.show(30)


+-----------+-----+
|       word|count|
+-----------+-----+
|          [|48044|
|          ]|32787|
|       data|16549|
|          ||11102|
|      notes|10569|
|       trap|10393|
|       card|10090|
|        See|10085|
|          (|10080|
|          S| 9586|
| collection| 9407|
|       Alch| 8578|
|     forest| 8558|
|        Co.| 8135|
|     litter| 8042|
|       soil| 7617|
|      field| 7342|
|     Number| 7096|
|     NOTEBY| 6965|
|          m| 6897|
|     flight| 6810|
|          )| 6762|
|  Herbarium| 6728|
| Collection| 6538|
|          C| 5964|
|          W| 5883|
|   specimen| 5729|
|   NOTEDATE| 5593|
|Preparation| 5074|
|preparation| 5074|
+-----------+-----+
only showing top 30 rows


In [15]:
noun_counts.cache()


Out[15]:
DataFrame[word: string, count: bigint]

In [16]:
noun_counts.orderBy(noun_counts["count"]).show(30)


+--------------------+-----+
|                word|count|
+--------------------+-----+
|            CC-79-19|    1|
|            Feathery|    1|
|                Elof|    1|
|            watshami|    1|
|           Embidobia|    1|
|           Cionthrix|    1|
|det_comments:46.0...|    1|
|           atranonin|    1|
|             rhzomes|    1|
|det_comments:23-26mm|    1|
|         malfunction|    1|
|                L541|    1|
|         Martinsburg|    1|
|           Cupidonia|    1|
|          secretary.|    1|
|               neut.|    1|
|            radulae.|    1|
|            Bekopaka|    1|
|          L.S.Dilion|    1|
|         longiusculo|    1|
|             Granids|    1|
|          Amblydoras|    1|
|                IDS.|    1|
|            Pristers|    1|
|                Reps|    1|
|          periphyton|    1|
|      W.J.Chamberlin|    1|
| Syn=Anthrocothecium|    1|
|         OSAL0094835|    1|
|                 CMB|    1|
+--------------------+-----+
only showing top 30 rows


In [17]:
noun_counts_pdf = noun_counts.limit(1000).toPandas()
print(noun_counts_pdf.head())

tuples = []
for l in noun_counts_pdf.iterrows():
    tuples.append( (l[1], l[0]) )


    word  count
0      [  48044
1      ]  32787
2   data  16549
3      |  11102
4  notes  10569

In [186]:
%matplotlib inline
from wordcloud import WordCloud

wordcloud = WordCloud().generate_from_frequencies(tuples)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-186-d1d9a105dc58> in <module>()
      2 from wordcloud import WordCloud
      3 
----> 4 wordcloud = WordCloud().generate_from_frequencies(tuples)

/usr/local/lib/python2.7/dist-packages/wordcloud/wordcloud.pyc in generate_from_frequencies(self, frequencies)
    261         """
    262         # make sure frequencies are sorted and normalized
--> 263         frequencies = sorted(frequencies, key=item1, reverse=True)
    264         frequencies = frequencies[:self.max_words]
    265         # largest entry will be 1

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in __nonzero__(self)
    712         raise ValueError("The truth value of a {0} is ambiguous. "
    713                          "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
--> 714                          .format(self.__class__.__name__))
    715 
    716     __bool__ = __nonzero__

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [ ]:
# And some verbs