========================
In [7]:
%matplotlib inline
# Optional: enable the two mpld3 lines below to make the graphs interactive.
# Only do this for smaller samples; they are left disabled on purpose.
#import mpld3
#mpld3.enable_notebook()
# Turns out, multiple interactive scattergraphs with 170,000+ points each is a bit too much for a browser.
# Who knew?!
In [1]:
from clustering_capitals import create_cluster_dataset, NewspaperArchive

# Database file handed to create_cluster_dataset() and ClusterDB() further down.
DBFILE = "1745-55.db"

# Open the archive of text areas and pull the areas for a single issue.
n = NewspaperArchive(textareas="/datastore/burneytextareas")
a = n.get_areas(newspaper="B0574REMEMBRA", year="1748", month="03", day="05")

# First entry under keys '0001' -> '001'
# (presumably page 1, article 1, going by the variable name - confirm).
pg1a1 = a['0001']['001'][0]

# Show the line count next to the number of line widths once the widths are
# cut to one fewer than the line count, then show the last width that is left.
line_total = len(pg1a1['lines'])
widths_head = pg1a1['line_widths'][:line_total - 1]
print(line_total, len(widths_head))
print(widths_head[-1])
In [2]:
# Get/create the dataset for the 1745-1755 range, backed by DBFILE.
# The original author left a disabled `refresh = True` argument on this call;
# presumably it forces the dataset to be rebuilt - confirm in clustering_capitals.
ds = create_cluster_dataset(
    n,
    daterange=[1745, 1755],
    dbfile=DBFILE,
)
What do these 'vectors' look like? What do the columns refer to?
In [3]:
# Unpack the dataset into the vectors, the feature <-> column transform and
# the list of ids (indexed in step with the rows of `data` further down).
data, transform, id_list = ds
print(data)

# NOTE(review): `transform` looks like a scikit-learn DictVectorizer - confirm
# in clustering_capitals. scikit-learn 1.2 removed get_feature_names() in
# favour of get_feature_names_out(), so use the new method when it exists and
# fall back to the old one on older installations.
if hasattr(transform, "get_feature_names_out"):
    feature_names = list(transform.get_feature_names_out())
else:
    feature_names = transform.get_feature_names()
print(feature_names)
In [4]:
from clustering_capitals import ClusterDB

# Look up the item behind the second vector (index 1) and print it next to
# the feature dict recovered from that same row of `data`.
db = ClusterDB(DBFILE)
sample_id = id_list[1]
item = dict(db.vecidtoitem(sample_id))
print(item)
print(transform.inverse_transform(data[1]))

from burney_data import BurneyDB

# Fetch the title, entry and issue rows that go with that item.
bdb = BurneyDB("burney.db")
titlemd = bdb.get_title_row(titleAbbreviation=item['newspaper'])
entry = bdb.get_entry_row(
    year=item['year'],
    month=item['month'],
    day=item['day'],
    title_id=titlemd['id'],
)
issue = bdb.get_issue_row(id=entry['issue_id'])
for row in (titlemd, issue, entry):
    print(row)

# The stored vector for the same id.
vector = db.vector(sample_id)
print(dict(vector))

# A hand-built feature dict pushed through the same transform, to see where
# each named feature ends up in the vector.
mask = {
    'ave_lsp': 1.0,
    'density': 1.0,
    'ltcount': 0.0,
    'redge_x2ave': 0.0,
    'st_caps': 1.0,
    'st_nums': 1.0,
    'x1_var1': 1.0,
    'x1_var2': 0.0,
    'x1ave_ledge': 0.0,
    'x2_var1': 1.0,
    'x2_var2': 0.0,
}
m_vec = transform.transform(mask)
print(m_vec)
In [5]:
import numpy as np
from matplotlib import pyplot as plt

# Dense copy of the vectors so that columns can be picked with a boolean mask.
npdata = data.toarray()

# Column order, for reference:
#   0 'ave_lsp', 1 'density', 2 'ltcount', 3 'redge_x2ave', 4 'st_caps',
#   5 'st_nums', 6 'x1_var1', 7 'x1_var2', 8 'x1ave_ledge', 9 'x2_var1',
#   10 'x2_var2'
# Keep just the left and right variance columns: x1_var1 (6) and x2_var1 (9).
mask = np.zeros(11, dtype=bool)
mask[[6, 9]] = True
marray = npdata[:, mask]
In [ ]:
# Scatter of the two columns held in marray: x1_var1 against x2_var1.
# A scalar size / line width applies to every point, so there is no need to
# build per-point Python lists for each of the rows.
plt.scatter(marray[:, 0], marray[:, 1], marker=".", s=2, linewidths=0.0)
plt.xlabel("x1_var1")
plt.ylabel("x2_var1")
plt.show()
In [ ]:
# Build the clustering and show the individual clusters as best we can:
from sklearn.cluster import KMeans

cl_mask = np.ones((11), dtype=bool)
# remember: ['ave_lsp', 'density', 'ltcount', 'redge_x2ave', 'st_caps',
# 'st_nums', 'x1_var1', 'x1_var2', 'x1ave_ledge', 'x2_var1', 'x2_var2']
# so, we should cluster on ave_lsp, density, st_caps, st_nums, x1_var1, x2_var1:
cl_mask[[2, 3, 7, 8, 10]] = False
cl_marray = npdata[:, cl_mask]

# K-means starts from random centroids, so without a fixed seed the numbering
# of the clusters changes from run to run. The notes and cells below refer to
# clusters by number, so pin the seed to keep the numbering stable.
# NOTE(review): the cluster numbers quoted in the notes came from an unseeded
# run - re-check which clusters are the interesting ones after re-running.
estimator = KMeans(n_clusters=12, random_state=0)
clusters = estimator.fit(cl_marray)  # fit() returns the estimator itself
labels = estimator.labels_
def isol(label, labels):
    """Yield one colour per entry of ``labels``, isolating a single cluster.

    Entries equal to ``label`` get the accent colour "#FF3355"; every other
    entry gets the grey "#444444".

    Args:
        label: the cluster label to pick out.
        labels: array or sequence of cluster labels, one per point.

    Yields:
        str: a hex colour string for each entry of ``labels``, in order.
    """
    # No float cast is needed for an equality test, and the ``np.float`` alias
    # no longer exists in NumPy >= 1.24 (using it raises AttributeError).
    for current in np.asarray(labels):
        yield "#FF3355" if current == label else "#444444"
def highlight(label, labels):
    """Yield one marker size per entry of ``labels``, enlarging one cluster.

    Entries equal to ``label`` get size 4; every other entry gets size 2.

    Args:
        label: the cluster label to emphasise.
        labels: array or sequence of cluster labels, one per point.

    Yields:
        int: the marker size for each entry of ``labels``, in order.
    """
    # No float cast is needed for an equality test, and the ``np.float`` alias
    # no longer exists in NumPy >= 1.24 (using it raises AttributeError).
    for current in np.asarray(labels):
        yield 4 if current == label else 2
# For each cluster in turn, plot x1_var1 against x2_var1 (columns 4 and 5 of
# cl_marray) with that cluster's points highlighted and the rest greyed out.
# np.unique gives the labels in ascending order, so the plots come out in a
# predictable sequence.
for label in np.unique(labels):
    print("Cluster: {0} - x1_var1 vs x2_var1".format(label))
    plt.scatter(cl_marray[:, 4], cl_marray[:, 5], c=list(isol(label, labels)), marker=".",
                s=list(highlight(label, labels)), linewidths=0.0)
    plt.xlabel("x1_var1")
    plt.ylabel("x2_var1")
    plt.show()
It looks like cluster 4, and perhaps cluster 11, are the ones that should contain more complete poems than the rest, if our assumptions are correct: their points clump at very low x1 (left-hand edge) variance but high x2 (right-hand edge) variance.
What do the other aspects of 4 and 11 look like?
In [9]:
# For the two candidate clusters, plot each pair of clustered features using
# only the points that belong to that cluster.
# Column positions refer to cl_marray, whose columns are, in order:
#   0 ave_lsp, 1 density, 2 st_caps, 3 st_nums, 4 x1_var1, 5 x2_var1
feature_pairs = [
    (0, 1, "ave_lsp", "density"),
    (2, 3, "st_caps", "st_nums"),
    (4, 5, "x1_var1", "x2_var1"),
]
for label in [4, 11]:
    in_cluster = labels == label
    for x_col, y_col, x_name, y_name in feature_pairs:
        print("Cluster: {0} - {1} vs {2}".format(label, x_name, y_name))
        plt.scatter(cl_marray[in_cluster, x_col], cl_marray[in_cluster, y_col],
                    marker=".", linewidths=1)
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.show()
Let's export this as a list of references to explore further - "clusterX.csv"
In [11]:
import csv
def get_info(item_id):
    """Collect everything known about one clustered item into a single dict.

    Starts from the item row for ``item_id`` in the cluster database, then
    layers on the matching title, entry and issue rows from the Burney
    database and finally the item's vector. Where the same key appears in
    more than one source, the source merged later wins.

    Args:
        item_id: id accepted by ``db.vecidtoitem`` and ``db.vector``.

    Returns:
        dict: the merged record.
    """
    record = dict(db.vecidtoitem(item_id))
    vect = dict(db.vector(item_id))

    titlemd = bdb.get_title_row(titleAbbreviation=record['newspaper'])
    entry = bdb.get_entry_row(
        year=record['year'],
        month=record['month'],
        day=record['day'],
        title_id=titlemd['id'],
    )
    issue = bdb.get_issue_row(id=entry['issue_id'])

    # Merge order matters: title, entry, issue, then the vector values.
    for extra in (titlemd, entry, issue, vect):
        record.update(extra)
    return record
# Write one CSV file per cluster label ("exp2_cluster<label>.csv"), holding a
# row for every item that was assigned to that cluster.
fields = ["title", "titleAbbreviation", "year", "month", "day",
          "issueNumber", "printedDate", "page", "article", "block_number",
          "filepath", "st_caps", "st_nums", "x1_var1", "x2_var1", "ltcount"]

for label in set(labels):
    print("Saving label {0}".format(label))
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires; without it, extra blank rows can appear on
    # platforms that translate "\n" on write.
    with open("exp2_cluster{0}.csv".format(label), "w", newline="") as cfn:
        csvdoc = csv.DictWriter(cfn, fieldnames=fields)
        csvdoc.writeheader()
        count = 0
        # idx walks the labels in step with id_list, so id_list[idx] is the
        # id of the item that was given label vlabel.
        for idx, vlabel in enumerate(labels):
            if idx % 1000 == 0:
                print("Tackling line {0} - saved {1} lines for this label".format(idx, count))
            if vlabel == label:
                record = get_info(id_list[idx])
                # Keep only the exported columns; a missing one raises KeyError.
                csvdoc.writerow({name: record[name] for name in fields})
                count += 1
In [ ]: