In [1]:
import csv
import re
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import seaborn as sns
%matplotlib inline
In [2]:
# Load the service code table. Column 0 of every row holds
# "<code> | <name>" and column 1 holds the request count for that code.
code_name_map = {}
code_histogram = {}
# Raw string so the regex escapes are not treated as string escapes.
patternobj = re.compile(r'^([0-9a-z]+)\s\|\s([0-9a-z\s]+)$')
# The context manager closes the file even when a row fails to parse.
with open("./serviceCodesCount.tsv", "r") as h_file:
    for row_number, fields in enumerate(csv.reader(h_file, delimiter="\t"),
                                        1):
        if not fields:
            # csv.reader yields an empty list for a blank line.
            continue
        matchobj = patternobj.match(fields[0])
        if matchobj is None or len(fields) < 2:
            raise ValueError("Malformed service code row %d: %r" %
                             (row_number, fields))
        cur_code = matchobj.group(1)
        code_name_map[cur_code] = matchobj.group(2)
        code_histogram[cur_code] = float(fields[1])
In [3]:
# Cumulative fraction of all requests covered by the service codes when the
# codes are ordered from most to least requested.
# list(...) is required because dict.values() is a view on Python 3.
sorted_counts = np.sort(np.array(list(code_histogram.values()),
                                 dtype=float))[::-1]
total_count_fraction = np.cumsum(sorted_counts / np.sum(sorted_counts))

sns.set(font_scale=2)
f, h_ax = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: raw request count per service code (dictionary order).
raw_counts = list(code_histogram.values())
h_ax[0].bar(range(0, len(raw_counts)), raw_counts)
h_ax[0].set_xlim((0, len(total_count_fraction)))
h_ax[0].set_xlabel('Service Code #')
h_ax[0].set_ylabel('Service Code Count')
h_ax[0].set_title('Cincinnati 311\nService Code Histogram')

# Right panel: cumulative request fraction over the sorted codes.
h_ax[1].plot(total_count_fraction, linewidth=4)
h_ax[1].set_xlim((0, len(total_count_fraction)))
h_ax[1].set_xlabel('Sorted Service Code #')
h_ax[1].set_ylabel('Total Count Fraction')

f.tight_layout()
f.savefig("./cincinatti311Stats.png")
In [4]:
from nltk.stem.snowball import SnowballStemmer
def tokenize(text):
    """ Extracts unigrams (i.e. words) from a string that contains
    a service code name.

    Args:
        text: String that stores a service code name

    Returns:
        filtered_tokens: List of lower-case words contained in a service
                         code name. Tokens that contain anything other
                         than the letters a-z are dropped."""
    tokens = [word.lower() for word in nltk.word_tokenize(text)]

    # Build a real list: on Python 3 filter()/map() return lazy iterators,
    # which breaks callers that take len() of the result.
    filtered_tokens = [re.sub(r"\s+", " ", elem)
                       for elem in tokens
                       if re.match('^[a-z]+$', elem) is not None]

    return filtered_tokens
def tokenize_and_stem(text):
    """ Applies the Snowball stemmer to unigrams (i.e. words) extracted
    from a string that contains a service code name.

    Args:
        text: String that stores a service code name

    Returns:
        filtered_tokens: List of stemmed words contained in a service
                         code name"""
    stemmer = SnowballStemmer('english')

    # Reuse tokenize() so both tokenizers share the same filtering rules.
    # The comprehension also accepts a lazy iterator from tokenize().
    filtered_tokens = [stemmer.stem(token) for token in tokenize(text)]

    return filtered_tokens
def compute_tfidf_features(code_name_map,
                           tokenizer,
                           params):
    """ Constructs a Term Frequency Inverse Document Frequency (TF-IDF)
    matrix for the Cincinnati 311 service code names.

    Args:
        code_name_map: Dictionary that stores the mapping of service
                       codes to service names

        tokenizer: Function that transforms a string into a list of
                   words

        params: Dictionary that stores parameters that configure the
                TfidfVectorizer class constructor

                - mindocumentcount: Minimum number of term occurrences
                                    in separate service code names

                - maxdocumentfrequency: Maximum document frequency

    Returns:
        Tuple that stores a TF-IDF matrix and a TfidfVectorizer class
        object.

        Index:  Description:
        -----   -----------
          0     TF-IDF matrix
          1     TfidfVectorizer class object

    Raises:
        ValueError: If code_name_map is empty"""
    # Materialise the names once so the token count and the fitted matrix
    # use the same document order (dict.values() is a view on Python 3).
    service_names = list(code_name_map.values())
    num_codes = len(service_names)

    if num_codes == 0:
        raise ValueError("code_name_map must contain at least one "
                         "service code")

    # Total number of tokens over all names, used as the max_features cap.
    # Counting with the supplied tokenizer avoids depending on a
    # notebook-level function; list() tolerates lazy tokenizers.
    token_count = sum(len(list(tokenizer(name))) for name in service_names)

    # Express the minimum document count as a fraction of all documents.
    min_df = float(params['mindocumentcount']) / num_codes

    tfidf_vectorizer =\
        TfidfVectorizer(max_df=params['maxdocumentfrequency'],
                        min_df=min_df,
                        stop_words='english',
                        max_features=token_count,
                        use_idf=True,
                        tokenizer=tokenizer,
                        ngram_range=(1, 1))

    tfidf_matrix = tfidf_vectorizer.fit_transform(service_names)

    return (tfidf_matrix,
            tfidf_vectorizer)
def cluster_311_services(tfidf_matrix,
                         num_clusters,
                         random_seed,
                         code_names=None):
    """Applies the K-means algorithm to cluster Cincinnati 311 service
    codes based on their service name Term Frequency Inverse Document
    Frequency (TF-IDF) feature vector.

    Args:
        tfidf_matrix: Cincinnati 311 service names TF-IDF feature matrix

        num_clusters: K-means algorithm number of clusters input

        random_seed: K-means algorithm random seed input

        code_names: Optional dictionary that stores the mapping of
                    service codes to service names. Its iteration order
                    must match the row order of tfidf_matrix. Defaults to
                    the notebook-level code_name_map.

    Returns:
        clusterid_code_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service code

        clusterid_name_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service name

    Raises:
        ValueError: If the number of cluster labels differs from the
                    number of service codes"""
    if code_names is None:
        # Backward-compatible fallback to the notebook-level mapping.
        code_names = code_name_map

    km = KMeans(n_clusters=num_clusters,
                random_state=np.random.RandomState(seed=random_seed))

    km.fit(tfidf_matrix)

    clusters = km.labels_.tolist()

    # items() keeps each code paired with its name and works on Python 3,
    # where dict.keys() / dict.values() cannot be indexed.
    code_name_pairs = list(code_names.items())

    if len(clusters) != len(code_name_pairs):
        raise ValueError("Got %d cluster labels for %d service codes" %
                         (len(clusters), len(code_name_pairs)))

    clusterid_code_map = defaultdict(list)
    clusterid_name_map = defaultdict(list)

    for cluster_id, (code, name) in zip(clusters, code_name_pairs):
        clusterid_code_map[cluster_id].append(code)
        clusterid_name_map[cluster_id].append(name)

    return (clusterid_code_map,
            clusterid_name_map)
def compute_clusterid_totalcounts(clusterid_code_map,
                                  code_histogram):
    """ Computes the total Cincinnati 311 requests / service
    names cluster

    Args:
        clusterid_code_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service code

        code_histogram: Dictionary that stores the number of
                        occurrences for each Cincinnati 311 service
                        code

    Returns:
        clusterid_total_count: Dictionary that stores the total
                               Cincinnati 311 requests / service
                               names cluster. Entries are inserted in
                               ascending cluster identifier order."""
    clusterid_total_count = defaultdict(int)

    # Iterate over the identifiers that actually exist instead of
    # range(len(...)). That form skipped non-contiguous ids and, for a
    # defaultdict input, silently inserted empty clusters.
    for cur_cluster_id in sorted(clusterid_code_map):
        clusterid_total_count[cur_cluster_id] =\
            sum(code_histogram[cur_code]
                for cur_code in clusterid_code_map[cur_cluster_id])

    return clusterid_total_count
def print_cluster_stats(clusterid_name_map,
                        clusterid_total_count):
    """ Prints the total number of codes and total requests count
    for each Cincinnati 311 service names cluster.

    Args:
        clusterid_name_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service name

        clusterid_total_count: Dictionary that stores the total
                               Cincinnati 311 requests / service
                               names cluster

    Returns:
        None"""
    # Walk the real cluster identifiers in ascending order rather than
    # assuming they are exactly 0..n-1.
    for cur_cluster_id in sorted(clusterid_total_count):
        # .get() avoids inserting an empty cluster into a defaultdict.
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("clusterid %d | # of codes: %d | total count: %d" %
              (cur_cluster_id,
               len(clusterid_name_map.get(cur_cluster_id, [])),
               clusterid_total_count[cur_cluster_id]))
def eval_maxcount_clusterid(clusterid_code_map,
                            clusterid_total_count,
                            code_histogram):
    """ This function performs the following two operations:
    1.) Plots the requests count for each service code in the
        maximum count service names cluster.

    2.) Prints the maximum count service code in the maximum count
        service names cluster

    Only clusters that contain more than one service code are
    considered.

    Args:
        clusterid_code_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service code

        clusterid_total_count: Dictionary that stores the total
                               Cincinnati 311 requests / service
                               names cluster

        code_histogram: Dictionary that stores the number of
                        occurrences for each Cincinnati 311 service
                        code

    Returns:
        None

    Raises:
        ValueError: If no cluster contains more than one service code"""
    maxcount_clusterid = None

    # Look clusters up by identifier. Pairing dict keys()/values() with a
    # positional boolean mask depends on dictionary ordering and fails on
    # Python 3 dictionary views.
    for cluster_id in sorted(clusterid_total_count):
        # .get() avoids inserting an empty cluster into a defaultdict.
        if len(clusterid_code_map.get(cluster_id, [])) <= 1:
            continue

        # Strict ">" keeps the lowest cluster identifier on ties.
        if (maxcount_clusterid is None or
                clusterid_total_count[cluster_id] >
                clusterid_total_count[maxcount_clusterid]):
            maxcount_clusterid = cluster_id

    if maxcount_clusterid is None:
        raise ValueError("No cluster contains more than one service code")

    cluster_codes = list(clusterid_code_map[maxcount_clusterid])

    cluster_code_counts =\
        np.array([code_histogram[code] for code in cluster_codes],
                 dtype=float)

    plt.bar(range(0, len(cluster_code_counts)), cluster_code_counts)
    plt.grid(True)
    plt.xlabel('Service Code #')
    plt.ylabel('Service Code Count')
    plt.title('Cluster #%d Service Code Histogram' %
              (maxcount_clusterid))

    max_idx = int(np.argmax(cluster_code_counts))

    print("max count code: %s" % (cluster_codes[max_idx],))
def add_new_cluster(from_clusterid,
                    service_code,
                    clusterid_total_count,
                    clusterid_code_map,
                    clusterid_name_map):
    """Creates a new service name(s) cluster

    Args:
        from_clusterid: Integer that refers to a service names
                        cluster that is being split

        service_code: String that refers to a 311 service code

        clusterid_total_count: Unused by this function and left
                               unmodified; recompute the totals after
                               the split.

        clusterid_code_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service code

        clusterid_name_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service name

    Returns:
        None - Service names cluster data structures are updated
               in place

    Raises:
        ValueError: If service_code is not in cluster from_clusterid"""
    # Work on copies; .get() avoids inserting into a defaultdict when the
    # cluster identifier is unknown.
    cluster_codes = list(clusterid_code_map.get(from_clusterid, []))
    cluster_names = list(clusterid_name_map.get(from_clusterid, []))

    if service_code not in cluster_codes:
        raise ValueError("Service code %r is not in cluster %r" %
                         (service_code, from_clusterid))

    code_idx = cluster_codes.index(service_code)
    service_name = cluster_names[code_idx]

    # max() does not depend on dictionary ordering and works on Python 3,
    # where dict.keys() cannot be indexed.
    next_clusterid = max(clusterid_code_map.keys()) + 1

    # Remove by position so the code and name lists stay aligned even when
    # several codes in the cluster share the same service name.
    del cluster_codes[code_idx]
    del cluster_names[code_idx]

    clusterid_code_map[from_clusterid] = cluster_codes
    clusterid_name_map[from_clusterid] = cluster_names

    clusterid_code_map[next_clusterid] = [service_code]
    clusterid_name_map[next_clusterid] = [service_name]
def print_clustered_servicenames(cur_clusterid,
                                 clusterid_name_map):
    """Prints the Cincinnati 311 service names(s) for a specific
    Cincinnati 311 service names cluster

    Args:
        cur_clusterid: Integer that refers to a specific Cincinnati 311
                       service names cluster

        clusterid_name_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service name

    Returns:
        None"""
    # .get() keeps this read-only: indexing a defaultdict with an unknown
    # cluster identifier would insert an empty cluster.
    for cur_name in clusterid_name_map.get(cur_clusterid, []):
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("%s" % (cur_name,))
def plot_cluster_stats(clusterid_code_map,
                       clusterid_total_count):
    """Plots the following service name(s) cluster statistics:
    - Number of service code(s) / service name(s) cluster
    - Total number of requests / service name(s) cluster

    Args:
        clusterid_code_map: Dictionary that stores the mapping of
                            cluster identifier to Cincinnati 311
                            service code

        clusterid_total_count: Dictionary that stores the total
                               Cincinnati 311 requests / service
                               names cluster

    Returns:
        None"""
    # Build explicit lists keyed by cluster identifier: map() and
    # dict.values() are lazy on Python 3 and cannot be passed to len() or
    # bar(). Sorting ties each bar to its own cluster identifier.
    code_cluster_ids = sorted(clusterid_code_map)
    codes_per_cluster = [len(clusterid_code_map[cluster_id])
                         for cluster_id in code_cluster_ids]

    count_cluster_ids = sorted(clusterid_total_count)
    total_counts = [clusterid_total_count[cluster_id]
                    for cluster_id in count_cluster_ids]

    f, h_ax = plt.subplots(1, 2, figsize=(12, 6))

    h_ax[0].bar(code_cluster_ids, codes_per_cluster)
    h_ax[0].set_xlabel('Service Name(s) cluster id')
    h_ax[0].set_ylabel('Number of service codes / cluster')

    h_ax[1].bar(count_cluster_ids, total_counts)
    h_ax[1].set_xlabel('Service Name(s) cluster id')
    h_ax[1].set_ylabel('Total number of requests')

    plt.tight_layout()
In [5]:
# TF-IDF features built from the un-stemmed service name tokens.
params = {'maxdocumentfrequency': 0.25,
          'mindocumentcount': 10}

(tfidf_matrix,
 tfidf_vectorizer) = compute_tfidf_features(code_name_map,
                                            tokenize,
                                            params)

# A single parenthesised argument prints the same on Python 2 and 3.
print("# of terms: %d" % (tfidf_matrix.shape[1]))

# Newer scikit-learn releases provide get_feature_names_out() in place of
# get_feature_names(); use whichever the installed version offers.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

print(feature_names)
In [6]:
# Cluster the service codes on the current tfidf_matrix with a fixed number
# of clusters and a fixed K-means seed, then print per-cluster statistics.
num_clusters = 20
kmeans_seed = 3806933558
(clusterid_code_map,
clusterid_name_map) = cluster_311_services(tfidf_matrix,
num_clusters,
kmeans_seed)
# Total request count of each cluster.
clusterid_total_count =\
compute_clusterid_totalcounts(clusterid_code_map,
code_histogram)
print_cluster_stats(clusterid_name_map,
clusterid_total_count)
In [7]:
# Plot the per-code request counts of the multi-code cluster with the
# largest total count and print its most requested service code.
eval_maxcount_clusterid(clusterid_code_map,
clusterid_total_count,
code_histogram)
In [8]:
# TF-IDF features rebuilt from stemmed service name tokens. This replaces
# the tfidf_matrix / tfidf_vectorizer computed from un-stemmed tokens.
params = {'maxdocumentfrequency': 0.25,
          'mindocumentcount': 10}

(tfidf_matrix,
 tfidf_vectorizer) = compute_tfidf_features(code_name_map,
                                            tokenize_and_stem,
                                            params)

# A single parenthesised argument prints the same on Python 2 and 3.
print("# of terms: %d" % (tfidf_matrix.shape[1]))

# Newer scikit-learn releases provide get_feature_names_out() in place of
# get_feature_names(); use whichever the installed version offers.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

print(feature_names)
In [9]:
# Re-cluster the service codes on the current tfidf_matrix with the same
# cluster count and K-means seed, then print and plot cluster statistics.
num_clusters = 20
kmeans_seed = 3806933558
(clusterid_code_map,
clusterid_name_map) = cluster_311_services(tfidf_matrix,
num_clusters,
kmeans_seed)
# Total request count of each cluster.
clusterid_total_count =\
compute_clusterid_totalcounts(clusterid_code_map,
code_histogram)
print_cluster_stats(clusterid_name_map,
clusterid_total_count)
plot_cluster_stats(clusterid_code_map,
clusterid_total_count)
In [10]:
# Plot the per-code request counts of the multi-code cluster with the
# largest total count and print its most requested service code.
eval_maxcount_clusterid(clusterid_code_map,
clusterid_total_count,
code_histogram)
In [11]:
# Move service code 'mtlfrn' out of cluster 1 into a new cluster of its own.
# The cluster id is hard-coded, so it depends on the clustering result above.
add_new_cluster(1,
'mtlfrn',
clusterid_total_count,
clusterid_code_map,
clusterid_name_map)
In [12]:
# Recompute the per-cluster totals (add_new_cluster does not update them)
# and print the refreshed cluster statistics.
clusterid_total_count =\
compute_clusterid_totalcounts(clusterid_code_map,
code_histogram)
print_cluster_stats(clusterid_name_map,
clusterid_total_count)
In [13]:
# Plot the per-code request counts of the multi-code cluster with the
# largest total count and print its most requested service code.
eval_maxcount_clusterid(clusterid_code_map,
clusterid_total_count,
code_histogram)
In [14]:
# Move service code 'ydwstaj' out of cluster 1 into a new cluster of its own.
# The cluster id is hard-coded, so it depends on the clustering result above.
add_new_cluster(1,
'ydwstaj',
clusterid_total_count,
clusterid_code_map,
clusterid_name_map)
In [15]:
# Recompute the per-cluster totals (add_new_cluster does not update them)
# and print the refreshed cluster statistics.
clusterid_total_count =\
compute_clusterid_totalcounts(clusterid_code_map,
code_histogram)
print_cluster_stats(clusterid_name_map,
clusterid_total_count)
In [16]:
# Plot the per-code request counts of the multi-code cluster with the
# largest total count and print its most requested service code.
eval_maxcount_clusterid(clusterid_code_map,
clusterid_total_count,
code_histogram)
In [17]:
# Move service code 'grfiti' out of cluster 1 into a new cluster of its own.
# The cluster id is hard-coded, so it depends on the clustering result above.
add_new_cluster(1,
'grfiti',
clusterid_total_count,
clusterid_code_map,
clusterid_name_map)
In [18]:
# Recompute the per-cluster totals (add_new_cluster does not update them)
# and print the refreshed cluster statistics.
clusterid_total_count =\
compute_clusterid_totalcounts(clusterid_code_map,
code_histogram)
print_cluster_stats(clusterid_name_map,
clusterid_total_count)
In [19]:
# Plot the per-code request counts of the multi-code cluster with the
# largest total count and print its most requested service code.
eval_maxcount_clusterid(clusterid_code_map,
clusterid_total_count,
code_histogram)
In [20]:
# Move service code 'dapub1' out of cluster 1 into a new cluster of its own.
# The cluster id is hard-coded, so it depends on the clustering result above.
add_new_cluster(1,
'dapub1',
clusterid_total_count,
clusterid_code_map,
clusterid_name_map)
In [21]:
# Recompute the per-cluster totals (add_new_cluster does not update them),
# then print and plot the final cluster statistics.
clusterid_total_count =\
compute_clusterid_totalcounts(clusterid_code_map,
code_histogram)
print_cluster_stats(clusterid_name_map,
clusterid_total_count)
plot_cluster_stats(clusterid_code_map,
clusterid_total_count)
In [22]:
# Start the manual labelling pass: reset the cluster id -> category map,
# label cluster 0, and print its service names for inspection.
cur_clusterid = 0
clusterid_category_map = {}
clusterid_category_map[cur_clusterid] = 'streetmaintenance'
print_clustered_servicenames(cur_clusterid,
clusterid_name_map)
# Advance to the cluster id used by the next labelling cell.
cur_clusterid += 1
In [23]:
# Label cluster 1. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 1
clusterid_category_map[cur_clusterid] = 'miscellaneous'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [24]:
# Label cluster 2. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 2
clusterid_category_map[cur_clusterid] = 'trashcart'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [25]:
# Label cluster 3. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 3
clusterid_category_map[cur_clusterid] = 'buildinghazzard'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [26]:
# Label cluster 4. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 4
clusterid_category_map[cur_clusterid] = 'buildingcomplaint'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [27]:
# Label cluster 5. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 5
clusterid_category_map[cur_clusterid] = 'repairrequest'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [28]:
# Label cluster 6. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 6
clusterid_category_map[cur_clusterid] = 'propertymaintenance'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [29]:
# Label cluster 7. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 7
clusterid_category_map[cur_clusterid] = 'defaultrequest'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [30]:
# Label cluster 8. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 8
clusterid_category_map[cur_clusterid] = 'propertycomplaint'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [31]:
# Label cluster 9. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 9
clusterid_category_map[cur_clusterid] = 'trashcomplaint'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [32]:
# Label cluster 10. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 10
clusterid_category_map[cur_clusterid] = 'servicecompliment'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [33]:
# Label cluster 11. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 11
clusterid_category_map[cur_clusterid] = 'inspection'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [34]:
# Label cluster 12. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 12
clusterid_category_map[cur_clusterid] = 'servicecomplaint'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [35]:
# Label cluster 13. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 13
clusterid_category_map[cur_clusterid] = 'buildinginspection'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [36]:
# Label cluster 14. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 14
clusterid_category_map[cur_clusterid] = 'buildingcomplaint'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [37]:
# Label cluster 15. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 15
clusterid_category_map[cur_clusterid] = 'signmaintenance'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [38]:
# Label cluster 16. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 16
clusterid_category_map[cur_clusterid] = 'requestforservice'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [39]:
# Label cluster 17. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 17
clusterid_category_map[cur_clusterid] = 'litter'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [40]:
# Label cluster 18. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 18
clusterid_category_map[cur_clusterid] = 'recycling'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [41]:
# Label cluster 19. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 19
clusterid_category_map[cur_clusterid] = 'treemaintenance'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [42]:
# Label cluster 20. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 20
clusterid_category_map[cur_clusterid] = 'metalfurniturecollection'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [43]:
# Label cluster 21. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 21
clusterid_category_map[cur_clusterid] = 'yardwaste'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [44]:
# Label cluster 22. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for the next cell.
cur_clusterid = 22
clusterid_category_map[cur_clusterid] = 'graffitiremoval'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [45]:
# Label cluster 23. The id is pinned so re-running this cell is idempotent;
# the trailing increment keeps the running counter valid for later cells.
cur_clusterid = 23
clusterid_category_map[cur_clusterid] = 'deadanimal'
print_clustered_servicenames(cur_clusterid,
                             clusterid_name_map)
cur_clusterid += 1
In [46]:
# Display the cluster id -> category label mapping built by the cells above.
clusterid_category_map
Out[46]:
In [47]:
import pandas as pd
# Pair each category label with its total by cluster id. Zipping two
# dictionaries' values() by position depends on dictionary ordering, and
# dict views are not sequences on Python 3.
labelled_cluster_ids = sorted(clusterid_category_map)

category_totalcountdf =\
    pd.DataFrame({'totalcount': [clusterid_total_count.get(cluster_id, 0)
                                 for cluster_id in labelled_cluster_ids]},
                 index=[clusterid_category_map[cluster_id]
                        for cluster_id in labelled_cluster_ids])

sns.set(font_scale=1.5)
category_totalcountdf.plot(kind='barh')
Out[47]:
In [48]:
# Map every service code to the category label of its cluster.
servicecode_category_map = {}

for clusterid in clusterid_name_map.keys():
    cur_category = clusterid_category_map[clusterid]

    for servicecode in clusterid_code_map[clusterid]:
        servicecode_category_map[servicecode] = cur_category

# Write the mapping as an indented dictionary literal:
#   - the opening line is indented 12 spaces and starts with "{"
#   - every following entry is indented 13 spaces
#   - the closing "}" follows the last entry
# Joining the entries also yields a well-formed literal for zero or one
# entries, which a first/middle/last branch on the index does not.
entries = ["\"%s\": \"%s\"" % (servicecode, category)
           for servicecode, category in servicecode_category_map.items()]

with open('serviceCodeCategory.txt', 'w') as fp:
    fp.write("%s{%s}" % (" " * 12,
                         (",\n" + " " * 13).join(entries)))
In [ ]: