In [1]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import numpy as np
import folium
from folium import plugins
from IPython.display import display
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
def aggregate_term(clusterobj,
                   term,
                   businesscounty=None):
    """ Computes a term aggregation
    Args:
        clusterobj: Elasticsearch cluster handle
        term: String that stores a term
        businesscounty: (Optional) Refers to a specific county
    Returns:
        term_aggregation: List of tuples that stores a term
            aggregation"""
    if businesscounty is None:
        searchobj = Search(using=clusterobj,
                           index="texasopendata",
                           doc_type="tdlr")
    else:
        searchobj =\
            Search(using=clusterobj,
                   index="texasopendata",
                   doc_type="tdlr").query('match',
                                          businesscounty=businesscounty)
    aggregation_name = 'by_' + term
    # size=0 requests every bucket for the term (pre-5.x Elasticsearch behaviour)
    searchobj.aggs.bucket(aggregation_name,
                          'terms',
                          field=term,
                          size=0)
    resultsobj = searchobj.execute()
    term_aggregation = []
    for elem in resultsobj.aggregations[aggregation_name].buckets:
        term_aggregation.append((elem['doc_count'], elem['key']))
    return term_aggregation
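
# For reference, a minimal sketch (assumes only elasticsearch_dsl, no live
# cluster) of the request body that aggregate_term builds; Search.to_dict()
# returns the raw body, whose 'aggs' entry should look roughly like
# {'by_licensetype': {'terms': {'field': 'licensetype', 'size': 0}}}.
_sketch = Search(index="texasopendata", doc_type="tdlr")
_sketch.aggs.bucket('by_licensetype', 'terms', field='licensetype', size=0)
print(_sketch.to_dict())
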
def get_term_counts(term_aggregation):
    """ Returns a list of term counts
    Args:
        term_aggregation: List of tuples that stores a term aggregation
    Returns:
        term_count: List of term counts"""
    return [elem[0] for elem in term_aggregation]
def compute_cumulativecount_statistic(term_aggregation):
    """ Computes a cumulative count statistic that is analogous
    to a Cumulative Distribution Function (CDF)
    Args:
        term_aggregation: List of tuples that stores a term aggregation
    Returns:
        cumulativecount_statistic: Cumulative count statistic that is
            analogous to a CDF"""
    counts = get_term_counts(term_aggregation)
    total_counts = np.sum(counts)
    normalized_counts = np.asarray(counts) / total_counts
    return np.cumsum(normalized_counts)
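
# Minimal illustration on a toy aggregation (not part of the TDLR data):
# counts of 6, 3 and 1 give a running share of 0.6, 0.9 and 1.0 of all
# documents, which is the CDF-like statistic used below.
_toy_aggregation = [(6, 'type_a'), (3, 'type_b'), (1, 'type_c')]
print(compute_cumulativecount_statistic(_toy_aggregation))  # [0.6 0.9 1.]
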
def eval_term_percentcontribution(term,
                                  term_aggregation,
                                  cumulativecount_statistic,
                                  threshold):
    """ Estimates the percent contribution of each unique term instance
    Args:
        term: String that refers to a specific term
        term_aggregation: List of tuples that stores a term aggregation
        cumulativecount_statistic: Cumulative count statistic that is
            analogous to a CDF
        threshold: Cumulative count statistic threshold"""
    # Index of the first bucket at which the cumulative statistic
    # reaches the threshold
    threshold_idx = int(np.argwhere(cumulativecount_statistic >= threshold)[0])
    print("%d %s span %g%% of all %s" % (threshold_idx+1,
                                         term,
                                         100*threshold,
                                         term))
    print("")
    total_counts = np.sum(get_term_counts(term_aggregation))
    print("Total counts:%d" % (total_counts))
    print("")
    term_aggregation = term_aggregation[:threshold_idx+1]
    percent_contribution = [(np.round(100.0*elem[0] / float(total_counts), 1),
                             elem[1]) for elem in term_aggregation]
    for elem in percent_contribution:
        print("%s: %g%% of total counts" % (elem[1], elem[0]))
def generate_licensetype_heatmap(clusterobj,
                                 licensetype,
                                 businesscounty):
    """ Generates a licensetype heatmap for a specific businesscounty
    Args:
        clusterobj: Elasticsearch cluster handle
        licensetype: String that refers to a specific license type
        businesscounty: String that refers to a specific businesscounty"""
    searchobj = Search(using=clusterobj,
                       index="texasopendata",
                       doc_type="tdlr"
                       ).filter(
                           "term", businesscounty=businesscounty
                       ).filter(
                           "term", licensetype=licensetype
                       )
    mailingaddresslocation = []
    # Keep only records with valid coordinates; -1 marks a missing
    # latitude or longitude in this data set
    for elem in searchobj.scan():
        if elem.mailingaddresslocation['lat'] != -1 and\
           elem.mailingaddresslocation['lon'] != -1:
            mailingaddresslocation.append((elem.mailingaddresslocation['lat'],
                                           elem.mailingaddresslocation['lon']))
    center_latitude = np.mean([elem[0] for elem in mailingaddresslocation])
    center_longitude = np.mean([elem[1] for elem in mailingaddresslocation])
    map_osm = folium.Map(location=[center_latitude, center_longitude])
    map_osm.add_children(plugins.HeatMap(mailingaddresslocation,
                                         min_opacity=0.1))
    display(map_osm)
    map_osm.save(re.sub(' ', '', licensetype) + "_" + businesscounty + ".html")
In [3]:
clusterobj = Elasticsearch('localhost:9200')
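
# Optional sanity check (a minimal sketch, not part of the original run):
# Elasticsearch.info() returns basic cluster metadata and raises if the
# node at localhost:9200 is unreachable.
print(clusterobj.info())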
In [4]:
term = 'licensetype'
threshold = 0.9
bylicensetype = aggregate_term(clusterobj, term)
bylicensetype_cdf = compute_cumulativecount_statistic(bylicensetype)
eval_term_percentcontribution(term,
                              bylicensetype,
                              bylicensetype_cdf,
                              threshold)
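
# A quick peek at the result (a minimal sketch, not part of the original
# run): each aggregation entry is a (doc_count, key) tuple ordered by
# descending doc_count, so the head of the list holds the most common
# license types.
for doc_count, key in bylicensetype[:5]:
    print(doc_count, key)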
In [5]:
sns.set(font_scale=1.5)
plt.plot(100*bylicensetype_cdf)
plt.xlabel('Number of unique license types')
plt.ylabel('Percentage of total licenses')
plt.title('TDLR License Type Cumulative Statistic')
plt.savefig('licensetype.png')
In [6]:
term = 'businesscounty'
threshold = 0.9
bybusinesscounty = aggregate_term(clusterobj, term)
bybusinesscounty_cdf = compute_cumulativecount_statistic(bybusinesscounty)
eval_term_percentcontribution(term,
                              bybusinesscounty,
                              bybusinesscounty_cdf,
                              threshold)
In [7]:
sns.set(font_scale=1.5)
plt.plot(100*bybusinesscounty_cdf)
plt.xlabel('Number of unique business counties')
plt.ylabel('Percentage of total licenses')
plt.title('TDLR Business County Cumulative Statistic')
plt.savefig('businesscounty.png')
In [8]:
term = 'licensetype'
businesscounty = 'HARRIS'
threshold = 0.9
bylicensetype = aggregate_term(clusterobj,
                               term,
                               businesscounty)
bylicensetype_cdf = compute_cumulativecount_statistic(bylicensetype)
eval_term_percentcontribution(term,
                              bylicensetype,
                              bylicensetype_cdf,
                              threshold)
In [9]:
generate_licensetype_heatmap(clusterobj,
                             'Cosmetology Operator',
                             'HARRIS')
In [10]:
generate_licensetype_heatmap(clusterobj,
                             'Apprentice Electrician',
                             'HARRIS')