Experimenting with chunking classifier


In [3]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [31]:
# Load the processed notes from Parquet.
# NOTE(review): `sc` is not defined anywhere in this notebook; it is presumably
# the SparkContext injected by the PySpark kernel/shell -- confirm before a fresh run.
sqlContext = SQLContext(sc)
notes_all = sqlContext.read.parquet("../data/idigbio_notes.parquet")
total_records = notes_all.count()
print(total_records)
# Work on a small (~1%) sample of the DataFrame. The fixed seed keeps the
# sample -- and every result derived from it below -- reproducible across
# kernel restarts; without it each run draws a different subset.
notes = notes_all.sample(withReplacement=False, fraction=0.01, seed=42)
notes.cache()
print(notes.count())


3230857
32028

In [32]:
# Tokenize each note: wrap the project tokenizer in a Spark UDF that is
# declared to return an array of strings, and store the result in a new
# `tokens` column computed from the `document` column.
from lib.tokens import Tokens
tokens = Tokens()
token_array_type = types.ArrayType(types.StringType())
udf_tokenize = sql.udf(tokens.tokenize, token_array_type)
notes_w_tokens = notes.withColumn('tokens', udf_tokenize(notes['document']))
# Preview the first five token lists without truncating long rows.
notes_w_tokens.select(sql.col("tokens")).show(5, truncate=False)


+------------------------------------------------------------------------------------------+
|tokens                                                                                    |
+------------------------------------------------------------------------------------------+
|[Lloydia, 1965, 28, :, 125, .]                                                            |
|[SEE, GVF, REG., 0191, FOR, MORE, DATA, See, GVF, 191, (, sta., 70, ), for, more, data, .]|
|[flight, intercept, trap]                                                                 |
|[Mixed, live, oak, and, blue, oak, with, chapparal, scrub]                                |
|[BMNH, (, E, ), 1013748]                                                                  |
+------------------------------------------------------------------------------------------+
only showing top 5 rows


In [33]:
# Part-of-speech tag each token list and store the result in a new `pos` column.
from lib.pos_tags import PosTags
pos_tags = PosTags()
# Declared UDF return type: an array of string -> string maps
# (the rendered output shows each map carrying `word` and `tag` keys).
tagged_token_type = types.ArrayType(
    types.MapType(types.StringType(), types.StringType())
)
udf_part_of_speech = sql.udf(pos_tags.tag, tagged_token_type)

notes_w_tokens2 = notes_w_tokens.withColumn(
    'pos', udf_part_of_speech(notes_w_tokens['tokens'])
)
# Preview the first five tagged rows without truncating long rows.
notes_w_tokens2.select(sql.col("pos")).show(5, truncate=False)


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pos                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Map(word -> Lloydia, tag -> NNP), Map(word -> 1965, tag -> CD), Map(word -> 28, tag -> CD), Map(word -> :, tag -> :), Map(word -> 125, tag -> CD), Map(word -> ., tag -> .)]                                                                                                                                                                                                                                                                                                                                                                       |
|[Map(word -> SEE, tag -> NNP), Map(word -> GVF, tag -> NNP), Map(word -> REG., tag -> NNP), Map(word -> 0191, tag -> CD), Map(word -> FOR, tag -> NNP), Map(word -> MORE, tag -> NNP), Map(word -> DATA, tag -> NNP), Map(word -> See, tag -> NNP), Map(word -> GVF, tag -> NNP), Map(word -> 191, tag -> CD), Map(word -> (, tag -> CD), Map(word -> sta., tag -> NNP), Map(word -> 70, tag -> CD), Map(word -> ), tag -> CD), Map(word -> for, tag -> IN), Map(word -> more, tag -> JJR), Map(word -> data, tag -> NNS), Map(word -> ., tag -> .)]|
|[Map(word -> flight, tag -> NN), Map(word -> intercept, tag -> VBD), Map(word -> trap, tag -> NN)]                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
|[Map(word -> Mixed, tag -> NNP), Map(word -> live, tag -> VBP), Map(word -> oak, tag -> NN), Map(word -> and, tag -> CC), Map(word -> blue, tag -> JJ), Map(word -> oak, tag -> NN), Map(word -> with, tag -> IN), Map(word -> chapparal, tag -> JJ), Map(word -> scrub, tag -> NN)]                                                                                                                                                                                                                                                                |
|[Map(word -> BMNH, tag -> NNP), Map(word -> (, tag -> :), Map(word -> E, tag -> NNP), Map(word -> ), tag -> :), Map(word -> 1013748, tag -> CD)]                                                                                                                                                                                                                                                                                                                                                                                                    |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows


In [11]:
# Write out this small set of tagged records so that a training set can be
# built by manually adding an IOB tag.
# Explicit overwrite mode keeps the cell re-runnable: the default save mode
# raises an error when the output path already exists.
notes_w_tokens2.write.mode("overwrite").json("../data/chunker_pre_training.json")

In [44]:
# With the annotated training data available, load it and train the chunker.
from lib.chunks import Chunks
chunker = Chunks()
training_data = chunker.load_training_data("../data/chunker_training_50_fixed.json")
chunker.train(training_data)

def make_phrases(s):
    """Run the trained chunker over `s` and assemble the tagged result.

    Passes `s` to `chunker.tag` and hands that output to `chunker.assemble`,
    returning whatever `assemble` produces. When called through
    `make_phrases_udf`, Spark is told the result is an array of
    string -> string maps.
    """
    tagged = chunker.tag(s)
    return chunker.assemble(tagged)

# Declared UDF return type: array of string -> string maps.
phrase_array_type = types.ArrayType(
    types.MapType(types.StringType(), types.StringType())
)
make_phrases_udf = sql.udf(make_phrases, phrase_array_type)

In [46]:
# Handle for the exploded per-phrase map column, reused in the chain below.
text_col = sql.col("text")

# Chunk every record into phrases, explode to one row per phrase, keep only
# the rows tagged "NP", lowercase the phrase text, and count occurrences of
# each distinct phrase.
phrases = (
    notes_w_tokens2
    .withColumn("phrases", make_phrases_udf(sql.col("pos")))
    .select(sql.explode(sql.col("phrases")).alias("text"))
    .filter(text_col["tag"] == "NP")
    .select(sql.lower(text_col["phrase"]).alias("phrase"))
    .groupBy(sql.col("phrase"))
    .count()
)

# Number of distinct phrases, followed by a preview of up to 50 of them.
print(phrases.count())
phrases.select(sql.col("phrase")).show(50, truncate=False)


191
+---------------------------------------------------+
|phrase                                             |
+---------------------------------------------------+
|eucalyptus oil                                     |
|ridge litter                                       |
|forest flight intercept trap                       |
|mycoportal does recognize                          |
|may be wrong                                       |
|forest litter                                      |
|field label                                        |
|blacklight trap                                    |
|flor cilidro                                       |
|fleshy terrestrial polypore                        |
|lip pale blue                                      |
|de color guinda                                    |
|litter montane evergreen forest litter             |
|see station sheet                                  |
|sandy rivershore                                   |
|slide collection                                   |
|purple to dark lavender                            |
|pit trap                                           |
|engelmannii- abies lasiocarpa stand                |
|mercury vapor lite                                 |
|spruce bog                                         |
|center clearing                                    |
|caudal coloration                                  |
|gland chemestry                                    |
|field notes                                        |
|pale lavender                                      |
|montane berlese forest litter                      |
|returned to river                                  |
|and bristly leaves                                 |
|invertebrate collection                            |
|mountain forest/ cloud forest flight intercept trap|
|atrraction trap                                    |
|malaise trap                                       |
|cloud forest litter                                |
|attracted to wet sand                              |
|past flowering                                     |
|flor color lila con petalo inferior blanco         |
|carrion trap                                       |
|stream valley                                      |
|hard to read                                       |
|elfin forest litter                                |
|red to dim gray                                    |
|turkey nest                                        |
|sheehy/aug2003 see station sheet                   |
|rain forest flight intercept trap                  |
|rye grass                                          |
|montane forest litter                              |
|morado obscuro                                     |
|premontane berlese forest litter                   |
|leathery polypore                                  |
+---------------------------------------------------+
only showing top 50 rows


In [ ]:


In [ ]: