In [2]:
%matplotlib inline
from __future__ import print_function
import os
import math

import matplotlib.pyplot as plt
import numpy
import seaborn as sns
import nltk

from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
import pyspark.ml.feature as feature
In [3]:
# Load the processed Parquet of notes with extracted relations.
# `sc` is the SparkContext provided by the notebook kernel.
sqlContext = SQLContext(sc)
notes_w_rels = sqlContext.read.parquet("../data/idigbio_relations.parquet")
total_records = notes_w_rels.count()
print(total_records)
#notes_w_rels.printSchema()
In [4]:
# Keep only records that have at least one extracted relation
relations = notes_w_rels\
    .filter(sql.size(sql.col("rels")) > 0)
# Uncomment to work with a small sample of the df
#relations = relations.sample(withReplacement=False, fraction=0.1)
relations.cache()
print(relations.count())
In [5]:
# What fraction of records contain at least one relation?
print(relations.count() / float(total_records))
In [6]:
# And how many relations are there total?
relations\
    .select(sql.sum(sql.size(sql.col("rels"))))\
    .show()
In [7]:
# Maybe graph distribution of relations per record?
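# A minimal sketch of one way to do that (not part of the original run):
# bucket records by the size of their "rels" array and eyeball the counts.
rels_per_record = relations\
    .select(sql.size(sql.col("rels")).alias("n_rels"))\
    .groupBy("n_rels")\
    .count()\
    .orderBy("n_rels")
rels_per_record.show()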
In [8]:
# cross-tabby list of everything
just_rels = relations\
    .select("uuid", sql.explode("rels").alias("triple"))\
    .select(sql.col("uuid"),
            sql.col("triple")["s"]["word"].alias("subject"),
            sql.col("triple")["v"]["word"].alias("verb"),
            sql.col("triple")["p"]["word"].alias("predicate"))
just_rels.show()
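# A quick way to eyeball which verbs dominate the extracted triples
# (a sketch, not part of the original analysis):
just_rels\
    .groupBy("verb")\
    .count()\
    .orderBy("count", ascending=False)\
    .show(10)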
In [9]:
# Now just the unique nouns (subjects and predicates, lower-cased)
# that we need to look up
nouns = relations\
    .select(sql.explode("rels").alias("triple"))\
    .select(sql.lower(sql.col("triple")["s"]["word"]).alias("noun"))\
    .unionAll(
        relations\
        .select(sql.explode("rels").alias("triple"))\
        .select(sql.lower(sql.col("triple")["p"]["word"]).alias("noun"))
    )\
    .dropna()\
    .groupBy("noun")\
    .count()\
    .orderBy("count", ascending=False)
In [10]:
num_nouns = nouns.count()
nouns.describe().show()
nouns.show(30)
In [11]:
# Median noun count: nouns is already sorted by count descending, so take
# the top half and show its smallest values.
nouns\
    .select(sql.col("count"))\
    .limit(int(math.floor(num_nouns / 2)))\
    .orderBy(sql.col("count"))\
    .show(3)
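# On Spark 2.0+, DataFrame.approxQuantile gives the median directly
# (a sketch, assuming a newer API than the SQLContext-era one used here):
#print(nouns.approxQuantile("count", [0.5], 0.01))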
In [12]:
# OK, how many nouns occur more than once?
nouns\
    .filter(sql.col("count") > 1)\
    .count()
Out[12]:
In [14]:
# Distribution of noun frequency: pull the 10,000 most frequent nouns
# into pandas for plotting (nouns is already sorted by count descending)
nouns_pdf = nouns.limit(10000).toPandas()
In [16]:
ticks = range(0, 10001, 1000)
ax = sns.barplot(x="noun", y="count", data=nouns_pdf, color="black")
ax.set_yscale('log')
ax.get_xaxis().set_ticks(ticks)
ax.get_xaxis().set_ticklabels(ticks)
ax.set_title("Distribution of Most Common 10,000 Nouns Found in Relations")
ax.set_ylabel("Count (Log scale)")
ax.set_xlabel("Rank of Unique Nouns")
Out[16]:
In [17]:
# Let's make some candidates to try looking up. Requiring more than one
# occurrence seems to trim off a lot, as does dropping nouns shorter
# than three characters.
nouns\
    .filter(sql.col("count") > 1)\
    .filter(sql.length(sql.col("noun")) > 2)\
    .count()
Out[17]:
In [ ]:
# Write those candidates out for the lookup step
nouns\
    .filter(sql.col("count") > 1)\
    .filter(sql.length(sql.col("noun")) > 2)\
    .write.format("json").save("../data/idigbio_nouns.json")
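# Sanity check (a sketch, assuming the write above succeeded): read the
# JSON back and confirm the row count matches the filtered count above.
#sqlContext.read.json("../data/idigbio_nouns.json").count()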