In [10]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import seaborn as sns
import nltk
import pyspark.ml.feature as feature
In [11]:
# Load Processed Parquet
sqlContext = SQLContext(sc)
phrases = sqlContext.read.parquet("../data/idigbio_phrases.parquet")\
.sort(sql.col("count"), ascending=False)
total_records = phrases.count()
print(total_records)
phrases.printSchema()
In [3]:
# Our job didn't keep the total number of phrases found, so we'll have to add that up here
phrases.select(sql.sum(sql.col("count"))).show()
In [4]:
phrases.describe().show()
phrases.show(30, truncate=False)
In [5]:
# Approximate median phrase count: phrases is already sorted by count descending,
# so ordering the top half of the rows ascending puts the median at the head
import math
phrases\
.select(sql.col("count"))\
.limit(int(math.floor(total_records/2)))\
.orderBy(sql.col("count"))\
.show(3)
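In [ ]:
# Sketch, not part of the original analysis: if this were run on Spark 2.0+,
# DataFrame.approxQuantile would return the median of "count" directly instead of
# the limit/orderBy approach above. The relative-error value here is arbitrary.
median_count = phrases.approxQuantile("count", [0.5], 0.001)[0]
print(median_count)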
In [6]:
# OK, how many phrases occur more than once?
phrases\
.filter(sql.col("count") > 1)\
.count()
Out[6]:
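In [ ]:
# Sketch: express the repeated-phrase count above as a fraction of all unique
# phrases (total_records was computed when the parquet file was loaded).
repeated = phrases.filter(sql.col("count") > 1).count()
print(repeated / float(total_records))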
In [7]:
# Distribution of phrase frequency: pull the top 10,000 phrases into pandas for plotting
phrases_pdf = phrases.limit(10000).toPandas()
In [8]:
import seaborn as sns
ax = sns.barplot(x="phrase", y="count", data=phrases_pdf, color="black")
ticks = range(0, 10001, 2000)
ax.set_yscale('log')
ax.get_xaxis().set_ticks(ticks)
ax.get_xaxis().set_ticklabels(ticks)
ax.set_title("Distribution of Noun Phrases Found")
ax.set_ylabel("Count (Log scale)")
ax.set_xlabel("Rank of Unique Noun Phrases")
Out[8]:
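In [ ]:
# Sketch, purely illustrative: drawing 10,000 individual bars with seaborn is slow,
# so a plain matplotlib line of count vs. rank conveys the same distribution.
# Uses phrases_pdf from the cell above.
fig, alt_ax = plt.subplots()
alt_ax.plot(range(len(phrases_pdf)), phrases_pdf["count"], color="black")
alt_ax.set_yscale("log")
alt_ax.set_title("Distribution of Noun Phrases Found")
alt_ax.set_ylabel("Count (Log scale)")
alt_ax.set_xlabel("Rank of Unique Noun Phrases")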
In [15]:
# Write the frequently used phrases out
# Rename the columns to match the nouns dataset so we can reuse the same OLS (Ontology Lookup Service) script
phrases\
.filter(sql.col("count") > 1)\
.select(sql.col("phrase").alias("noun"), sql.col("count"))\
.write.format("json").save("../data/idigbio_phrases.json")
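In [ ]:
# Sketch: the save above throws if ../data/idigbio_phrases.json already exists.
# Adding .mode("overwrite") makes the cell re-runnable; everything else is
# unchanged from the cell above.
phrases\
.filter(sql.col("count") > 1)\
.select(sql.col("phrase").alias("noun"), sql.col("count"))\
.write.mode("overwrite").format("json").save("../data/idigbio_phrases.json")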
In [12]:
# The lookups are done; now analyze just the phrases we looked up
# Load the lookup results (JSON)
ols = sqlContext.read.json("../data/idigbio_envo_terms.json")
ols = ols\
.filter(sql.col("time") > "2016-04-19T00:00:00.00")
total_records = ols.count()
print(total_records)
#ols.printSchema()
In [13]:
print(ols.filter(sql.col("term_id") != "").count())
print(ols.filter(sql.col("term_id") == "").count())
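In [ ]:
# Sketch: express the two counts above as a match rate -- the fraction of
# looked-up phrases that resolved to an ENVO term_id. Uses total_records from
# the OLS load above.
matched = ols.filter(sql.col("term_id") != "").count()
print(matched / float(total_records))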
In [14]:
# What kinds of things were we finding?
terms = ols\
.dropna(subset=["term_id"])\
.filter(sql.col("term_id") != "")\
.groupBy(sql.col("term_id"), sql.col("term_label"))\
.count()\
.orderBy(sql.col("count"), ascending=False)
print(terms.count())
terms.show(20, truncate=False)
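In [ ]:
# Sketch, purely illustrative: how concentrated are the matches? Sum the counts
# of the top 20 terms and compare against the total number of matched lookups.
top20_total = terms.limit(20).select(sql.sum(sql.col("count"))).collect()[0][0]
matched_total = ols.filter(sql.col("term_id") != "").count()
print(top20_total, matched_total)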
In [18]:
# Manually review a random sample of positive and negative results
# A different random sample is drawn every time this cell is run
ols\
.filter(sql.col("term_id") != "")\
.sample(withReplacement=False, fraction=0.01)\
.unionAll(
ols\
.filter(sql.col("term_id") == "")\
.sample(withReplacement=False, fraction=0.02)
)\
.select(sql.col("noun"), sql.col("term_label"))\
.show(1000, truncate=False)
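In [ ]:
# Sketch: the comment above notes that a different sample is drawn on every run.
# Passing a seed to sample() makes the review set reproducible; the seed value
# here is arbitrary.
ols\
.filter(sql.col("term_id") != "")\
.sample(withReplacement=False, fraction=0.01, seed=42)\
.select(sql.col("noun"), sql.col("term_label"))\
.show(20, truncate=False)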