In [24]:
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
In [25]:
df = (sqlContext.read.load("/guoda/data/idigbio-20190612T171757.parquet")
)
Calculate features
In [26]:
from pyspark.ml.linalg import Vectors, VectorUDT
#data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
# (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
#print(spark.createDataFrame(data, ["features"]).head(10))
In [27]:
from pyspark.sql.functions import col, length, udf
import re
hex_chars = re.compile(r'[0-9a-fA-F]')
special_chars = re.compile(r'[\:\\\-_ ]')
def make_features(occurrenceid):
    # Three simple features: string length, hex-digit count, and separator-character count
    return Vectors.dense(
        len(occurrenceid),
        len(re.findall(hex_chars, occurrenceid)),
        len(re.findall(special_chars, occurrenceid))
    )
    #return [float(len(occurrenceid)), 5.0]
make_features_udf = udf(make_features, VectorUDT())
# length(col("occurrenceid")
features = (df
    .select(col("occurrenceid"),
            make_features_udf(col("occurrenceid")).alias("features"))
    .persist()
)
#features = df.rdd.map(lambda x: Vectors.dense(len(x["occurrenceid"])))
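As a quick sanity check, make_features can be called directly on a plain Python string. The occurrence ID below is a made-up, UUID-style example (not a real iDigBio record), used only to show the three-element vector the function produces.
In [ ]:
# Hypothetical occurrence ID, for illustration only
example_id = "urn:uuid:0000aaaa-1111-2222-3333-bbbbccccdddd"
# Expected: a dense vector of (total length, hex character count, special character count)
make_features(example_id)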
In [28]:
features.head(10)
Out[28]:
In [29]:
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=4, seed=1)
model = kmeans.fit(features)
In [30]:
centers = model.clusterCenters()
print(centers)
In [31]:
import pandas as pd
centers_df = pd.DataFrame(centers)
centers_df
Out[31]:
In [32]:
# graph centers
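The cell above is a placeholder; a minimal sketch of graphing the k=4 centers, assuming the centers_df built in the previous cell (columns 0, 1, 2 being length, hex characters, and special characters), could look like this:
In [ ]:
# Sketch: plot the four cluster centers in the 3-dimensional feature space
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=centers_df[0], ys=centers_df[1], zs=centers_df[2])
ax.set_xlabel("Length")
ax.set_ylabel("Hex Characters")
ax.set_zlabel("Special Characters")
plt.title("Cluster Centers of Occurrence ID Features (k=4)")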
In [33]:
# what are the most important axes? pca
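Also a placeholder; one way to answer it, assuming Spark's built-in pyspark.ml.feature.PCA estimator, is to fit PCA on the same features column and inspect how much variance each component explains:
In [ ]:
# Sketch: PCA over the feature vectors to see which directions carry the most variance
from pyspark.ml.feature import PCA
pca = PCA(k=3, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(features)
print(pca_model.explainedVariance)  # fraction of variance per component
print(pca_model.pc)                 # loadings of the original features on each component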
In [34]:
# select those dimensions from graph centers and plot in 2 dimensions
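Another placeholder; a sketch of the 2-D view, assuming the two dimensions chosen are simply columns 0 and 1 of centers_df (swap in the PCA picks if preferred):
In [ ]:
# Sketch: project the centers onto two chosen feature dimensions and plot them
dim_x, dim_y = 0, 1  # e.g. length vs. hex character count
plt.scatter(centers_df[dim_x], centers_df[dim_y])
plt.xlabel("Feature {0}".format(dim_x))
plt.ylabel("Feature {0}".format(dim_y))
plt.title("Cluster Centers in 2 Dimensions")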
That's cool, but how do we know 4 is a good k? A common check is to fit models over a range of k values and look for an "elbow" where the clustering cost stops improving much.
In [35]:
# parallel arrays for the win
ks = []
models = []
centers = []
costs = []
for i in range(5, 21):
    kmeans = KMeans(k=i, seed=1)
    model = kmeans.fit(features)
    ks.append(i)
    models.append(model)
    centers.append(model.clusterCenters())
    costs.append(model.computeCost(features))
In [36]:
# predict as a UDF to run on all data to find examples of things in the clusters
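This cell is also a placeholder. A UDF may not even be needed: transform on a fitted model already labels every row with its cluster. A sketch using one of the models fitted above (the choice of models[0], i.e. k=5, is arbitrary):
In [ ]:
# Sketch: label every record with its predicted cluster, then pull a few examples from one cluster
labeled = models[0].transform(features)
(labeled
    .filter(col("prediction") == 0)
    .select(col("occurrenceid"))
    .limit(5)
    .toPandas()
)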
In [37]:
#print(models)
print(costs)
In [38]:
plt.plot(ks, costs)
Out[38]:
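Reading the elbow off the plot is a judgment call; a rough numerical cross-check, assuming the parallel ks and costs lists above, is to look at how much each additional cluster reduces the cost:
In [ ]:
# Sketch: marginal cost reduction per extra cluster; small drops suggest diminishing returns
pd.Series(costs, index=ks).diff()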
In [39]:
final_k = 15
In [40]:
centers_df = pd.DataFrame(centers[ks.index(final_k)])
centers_df
Out[40]:
We need to count the number of records in each cluster.
In [41]:
final_model = models[ks.index(final_k)]
#.predict([51, 17, 1.1])
final_model.summary.predictions.head(10)
Out[41]:
In [42]:
sizes = (final_model.summary.predictions
    .groupBy(col("prediction"))
    .count()
    .orderBy(col("prediction"))
    .toPandas()
)
max_size = 1000
sizes["scaled"] = (sizes["count"] / sizes["count"].max()) * max_size
In [43]:
sizes.head(10)
Out[43]:
In [44]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=centers_df[0], ys=centers_df[1], zs=centers_df[2], s=sizes["scaled"])
plt.title("Relative Cluster Sizes of Occurrence IDs in iDigBio")
plt.xlabel("Length")
plt.ylabel("Hex Characters")
ax.set_zlabel("Special Characters")
Out[44]:
Let's take a look at a sampling of these clusters.
In [46]:
for i in [0, 1, 2, 3]:
    print("Cluster {0}".format(i))
    print(final_model.summary.predictions
          .filter(col("prediction") == i)
          .select(col("occurrenceid"))
          .limit(5)
          .toPandas()
    )