In [46]:
from itertools import combinations
import jellyfish
from scipy.cluster import hierarchy
import numpy as np
import matplotlib.pyplot as plt
In [47]:
# Small sample of near-duplicate names used to sanity-check the distance metric
strs = [
    u"MARTHA",
    u"MARHTA",
    u"DWAYNE",
    u"DUANE",
    u"DIXON",
    u"DICKSONX",
]
In [48]:
# Pairwise Jaro distance (1 - similarity) for every unordered pair of strings,
# in combinations() order, i.e. the condensed layout that linkage() is fed below.
# jellyfish renamed jaro_distance() to jaro_similarity(); prefer the new name and
# fall back to the old one so this cell runs on both old and new releases.
_jaro_similarity = getattr(jellyfish, "jaro_similarity", None) or jellyfish.jaro_distance
jaro_dists = [1 - _jaro_similarity(x, y) for x, y in combinations(strs, 2)]
In [49]:
# Inspect the pairwise distances; values are ordered as combinations(strs, 2)
jaro_dists
Out[49]:
In [50]:
# Single-linkage hierarchical clustering of the pairwise distances, drawn as a
# left-oriented dendrogram labelled with the original strings.
ytdist = np.array(jaro_dists)
Z = hierarchy.linkage(ytdist, 'single')
fig, ax = plt.subplots()
# set_link_color_palette() changes module-wide state in scipy, so the reset to
# the default lives in `finally`: if dendrogram() raises, the custom palette
# must not leak into every later plot made in this kernel.
hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
try:
    dn = hierarchy.dendrogram(
        Z,
        above_threshold_color='#bcbddc',
        orientation='left',
        labels=strs,
        ax=ax,
    )
finally:
    hierarchy.set_link_color_palette(None)  # reset to default after use
plt.show()
In [63]:
# AWS service names, copied by hand from the AWS console. If you know of an
# AWS API endpoint that returns an equivalent list, please let the author know!
strs = [
    u"API Gateway",
    u"Application Discovery Service",
    u"AppStream",
    u"AppStream 2.0",
    u"Athena",
    u"AWS IoT",
    u"Certificate Manager",
    u"CloudFormation",
    u"CloudFront",
    u"CloudSearch",
    u"CloudTrail",
    u"CloudWatch",
    u"CodeBuild",
    u"CodeCommit",
    u"CodeDeploy",
    u"CodePipeline",
    u"Cognito",
    u"Compliance Reports",
    u"Config",
    u"Data Pipeline",
    u"Device Farm",
    u"Direct Connect",
    u"Directory Service",
    u"DMS",
    u"DynamoDB",
    u"EC2",
    u"EC2 Container Service",
    u"Elastic Beanstalk",
    u"Elastic File System",
    u"Elastic Transcoder",
    u"ElastiCache",
    u"Elasticsearch Service",
    u"EMR",
    u"GameLift",
    u"Glacier",
    u"IAM",
    u"Inspector",
    u"Kinesis",
    u"Lambda",
    u"Lex",
    u"Lightsail",
    u"Machine Learning",
    u"Managed Services",
    u"Mobile Analytics",
    u"Mobile Hub",
    u"OpsWorks",
    u"Pinpoint",
    u"Polly",
    u"QuickSight",
    u"RDS",
    u"Redshift",
    u"Rekognition",
    u"Route 53",
    u"S3",
    u"Server Migration",
    u"Service Catalog",
    u"SES",
    u"Snowball",
    u"SNS",
    u"SQS",
    u"Step Functions",
    u"Storage Gateway",
    u"SWF",
    u"Trusted Advisor",
    u"VPC",
    u"WAF & Shield",
    u"WorkDocs",
    u"WorkMail",
    u"WorkSpaces",
]
In [64]:
# Pairwise Jaro-Winkler distance (1 - similarity) for every unordered pair of
# service names, in combinations() order.
# jellyfish renamed jaro_winkler() to jaro_winkler_similarity(); prefer the new
# name and fall back to the old one so this cell runs on both old and new releases.
# The result keeps the name `jaro_dists` because the plotting cell reads it.
_jaro_winkler_similarity = (
    getattr(jellyfish, "jaro_winkler_similarity", None) or jellyfish.jaro_winkler
)
jaro_dists = [1 - _jaro_winkler_similarity(x, y) for x, y in combinations(strs, 2)]
In [80]:
# Single-linkage hierarchical clustering of the pairwise distances, drawn as a
# tall, left-oriented dendrogram labelled with the service names.
ytdist = np.array(jaro_dists)
Z = hierarchy.linkage(ytdist, 'single')
fig, ax = plt.subplots(figsize=(6, 10), facecolor='white')
fig.suptitle('Jaro-Winkler Similarity of AWS Service Names', y=.94, fontsize=16)
ax.set_title('github.com/agussman | T:@percontate', fontsize=10)
# The colors don't mean anything; anything below the color_threshold uses one
# of these colors, everything above it uses above_threshold_color.
# set_link_color_palette() changes module-wide state in scipy, so the reset to
# the default lives in `finally`: if dendrogram() raises, the custom palette
# must not leak into every later plot made in this kernel.
hierarchy.set_link_color_palette(['g', 'r', 'm', 'c'])
try:
    dn = hierarchy.dendrogram(
        Z,
        color_threshold=0.25,
        above_threshold_color='#bcbddc',
        orientation='left',
        labels=strs,
        ax=ax,
    )
finally:
    hierarchy.set_link_color_palette(None)  # reset to default after use
plt.show()
In [ ]: