In [1]:
import pickle
In [2]:
import numpy
import scipy
import pandas
import spacy
import textacy
In [3]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
In [24]:
with open('/home/immersinn/Dropbox/Analytics/NCGA/data/bill_page_keywords.pkl', 'rb') as f1:
    keywords = pickle.load(f1)
In [25]:
keywords.head()
Out[25]:
In [26]:
from collections import defaultdict
In [27]:
keycounts = defaultdict(int)
In [28]:
def updateKeycounts(kws):
    # Tally each keyword occurrence into the module-level defaultdict
    for kw in kws:
        keycounts[kw] += 1
In [29]:
_ = keywords.apply(lambda x: updateKeycounts(x.keywords), axis=1)
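An equivalent, side-effect-free way to build the same tallies (a sketch, not part of the original run) is collections.Counter, which flattens the per-bill keyword lists and counts them in one pass:

from collections import Counter

# One Counter entry per distinct keyword, valued by total occurrences
keycounts_alt = Counter(kw for kws in keywords['keywords'] for kw in kws)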
In [30]:
keycounts = pandas.DataFrame({"word": [w for w in keycounts.keys()],
                              "count": [c for c in keycounts.values()]})
In [31]:
keycounts.sort_values(by="count", ascending=False, inplace=True)
keycounts.index = range(keycounts.shape[0])
In [32]:
keycounts.head()
Out[32]:
In [33]:
keycounts['count'].describe(percentiles=[.25, .50, .75, .80, .85, .90, .95])
Out[33]:
In [34]:
g = sns.barplot(x="word", y="count", data=keycounts[keycounts['count']>=100], estimator=sum);
g.figure.set_size_inches(15,10);
plt.xticks(rotation="vertical", size=12);
plt.title('Total Occurrences of Each Keyword', size=14);
plt.ylabel("");
In [35]:
from sklearn import metrics, feature_extraction, feature_selection, cluster
In [36]:
def list2dict(l):
    # Convert a keyword list into the binary indicator dict that
    # DictVectorizer expects as input
    return {n: 1 for n in l}
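For example, with made-up keywords:

list2dict(['education', 'budget'])
# -> {'education': 1, 'budget': 1}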
In [37]:
keywords['kyd'] = keywords.apply(lambda x: list2dict(x.keywords), axis=1)
In [38]:
keywords.kyd[0]
Out[38]:
In [39]:
kwDV = feature_extraction.DictVectorizer()
In [40]:
kw_feats = kwDV.fit_transform(keywords['kyd']).todense()
In [41]:
kw_feats.shape
Out[41]:
In [42]:
len(kwDV.get_feature_names())
Out[42]:
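To make the vectorizer's behavior concrete, here is a toy sketch (made-up keywords, not from the bill data): each row is a document, each column a keyword sorted alphabetically, and entries are the 0/1 indicators.

toy = feature_extraction.DictVectorizer()
X = toy.fit_transform([{'tax': 1, 'school': 1}, {'tax': 1}]).todense()
toy.get_feature_names()   # ['school', 'tax']
X                         # matrix([[1., 1.], [0., 1.]])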
In [43]:
# Restrict the features being used:
# keywords must occur at least count_cutoff times in the data
# (larger requirements were found to result in too many groups).
# Note: the support mask must follow the vectorizer's own feature
# order (kwDV.get_feature_names()), not the count-sorted order of
# the keycounts frame.
count_cutoff = 1
count_lookup = dict(zip(keycounts['word'], keycounts['count']))
support = numpy.array([count_lookup[w] >= count_cutoff
                       for w in kwDV.get_feature_names()])
In [44]:
kwDV.restrict(support)
Out[44]:
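restrict mutates the fitted vectorizer in place (and returns it), keeping only the features where the boolean mask is True; the mask must be in the same order as get_feature_names(). A toy sketch with made-up features:

toy = feature_extraction.DictVectorizer()
toy.fit([{'a': 1, 'b': 1, 'c': 1}])
toy.restrict(numpy.array([True, False, True]))
toy.get_feature_names()   # ['a', 'c']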
In [45]:
len(kwDV.get_feature_names())
Out[45]:
In [46]:
kw_feats = kwDV.transform(keywords['kyd']).todense()
In [47]:
kw_feats.shape
Out[47]:
In [48]:
# Keywords per bill: row sums of the binary document-keyword matrix
pandas.Series(numpy.array(kw_feats.sum(axis=1)).reshape(-1)).describe()
Out[48]:
In [49]:
kw_cos = metrics.pairwise.cosine_similarity(kw_feats)
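For these 0/1 rows, cosine similarity reduces to the number of shared keywords divided by the geometric mean of the two keyword counts; a quick sanity check on made-up vectors:

a = numpy.array([[1, 1, 0, 0]])
b = numpy.array([[1, 0, 1, 1]])
metrics.pairwise.cosine_similarity(a, b)   # 1 shared / sqrt(2 * 3) ~= 0.408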
In [50]:
bandwidth = cluster.estimate_bandwidth(kw_feats, quantile=0.3, n_samples=1000, n_jobs=4)
In [51]:
ms = cluster.MeanShift(bandwidth=bandwidth, n_jobs=4)
ms.fit(kw_feats)
Out[51]:
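estimate_bandwidth picks a kernel width from the pairwise-distance distribution (here the 0.3 quantile over 1000 sampled points), and MeanShift then shifts each point toward the density mode within that radius. A rough sanity check on synthetic blobs (a sketch, not run against the bill features):

from sklearn.datasets import make_blobs

Xs, _ = make_blobs(n_samples=300, centers=3, random_state=0)
bw = cluster.estimate_bandwidth(Xs, quantile=0.3)
# Three well-separated blobs should yield three centers: shape (3, 2)
cluster.MeanShift(bandwidth=bw).fit(Xs).cluster_centers_.shape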
In [52]:
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = numpy.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
In [53]:
# Flatten every cluster-center coordinate into a single Series
cc_vals = pandas.Series(cluster_centers.reshape(-1))
In [54]:
cc_vals.describe(percentiles=[0.50, 0.80, 0.90, 0.95, 0.975, 0.99])
Out[54]: