In [167]:
from lxml import etree
import re
import math
import numpy as np
import pandas as pd
from collections import Counter
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN, KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD, PCA, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (8.0, 4.0)
In [3]:
class Text(BaseEstimator, TransformerMixin):
    def __init__(self, lenMin=2000, lenMax=2000000, maxChars=8, cutoff=12000):
        self.charDict = {}
        self.lenMin = lenMin
        self.lenMax = lenMax
        self.maxChars = maxChars
        self.cutoff = cutoff

    def fit(self, *_):
        return self

    def transform(self, filename):
        self.tree = etree.parse(filename)
        # First find and remove all letters-within-letters, so that a quoted
        # letter is attributed to its own writer rather than to the quoting one.
        self.nestedLetters = self.tree.findall('.//floatingText[@type="letter"]')
        for letter in self.nestedLetters:
            self.parseLetter(letter)
            letter.getparent().remove(letter)
        # Parse the remaining letters.
        self.allLetters = self.tree.findall('.//div[@type="letter"]')
        for letter in self.allLetters:
            self.parseLetter(letter)
        # Flatten charDict into parallel lists of documents and labels.
        self.allDocs = []
        self.allLabels = []
        for char in self.charDict:
            for letter in self.charDict[char]:
                self.allDocs.append(letter)
                self.allLabels.append(char)
        # Restrict to letters of appropriate length, written by the
        # most prolific correspondents.
        self.docs = []
        self.labels = []
        self.topChars = dict(Counter(self.allLabels).most_common(self.maxChars))
        for doc, label in zip(self.allDocs, self.allLabels):
            if self.lenMin < len(doc) < self.lenMax and label in self.topChars:
                self.docs.append(doc)
                self.labels.append(label)
        # Cut off documents artificially at a certain point,
        # so that document lengths are more even.
        if self.cutoff is not None:
            self.docs = [doc[:self.cutoff] for doc in self.docs]
        self.chars = list(set(self.labels))
        self.numericLabels = [self.chars.index(char) for char in self.labels]
        self.lengths = [len(doc) for doc in self.docs]
        return self.docs

    def plotLettersPerChar(self):
        lettersPerChar = dict(Counter(self.labels).most_common(20))
        pd.Series(lettersPerChar).plot(kind='bar')

    def plotHist(self):
        print('%s documents' % len(self.docs))
        pd.Series(self.lengths).hist()

    def parseLetter(self, letter):
        # Attribute the letter to its writer: prefer an explicit `who`
        # attribute, falling back on the first signed[@who] element.
        # (Default to None so unattributed letters are skipped, rather
        # than raising a NameError below.)
        attribution = None
        if 'who' in letter.attrib:
            attribution = letter.attrib['who']
        else:
            signed = letter.findall('.//signed[@who]')
            if len(signed) > 0:
                attribution = signed[0].attrib['who']
        if attribution is not None:
            ps = letter.findall('.//p')
            text = " ".join([" ".join(p.itertext()) for p in ps])
            if attribution in self.charDict:
                self.charDict[attribution].append(text)
            else:
                self.charDict[attribution] = [text]
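As a quick sanity check, here is a minimal sketch of the two attribution paths parseLetter handles. The markup is a hypothetical TEI-like snippet invented to match the XPath queries above; clarissa.xml itself may differ.
In [ ]:
from io import BytesIO

# One letter attributed via a `who` attribute, one via a signed[@who] element.
sample = BytesIO(b"""<TEI>
  <div type="letter" who="Lovelace"><p>Dear Belford ...</p></div>
  <div type="letter">
    <signed who="Clarissa">Cl. Harlowe</signed>
    <p>Sir ...</p>
  </div>
</TEI>""")
demo = Text(lenMin=0, maxChars=2, cutoff=None).fit()
demo.transform(sample)
print(demo.labels)  # expected: ['Lovelace', 'Clarissa']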
In [2]:
# Code adapted from http://stackoverflow.com/a/28384887/584121
class DenseTransformer(BaseEstimator, TransformerMixin):
    """Convert a sparse matrix to a dense array, so that sparse-emitting
    steps like TfidfVectorizer can feed dense-only steps like PCA.
    Inheriting BaseEstimator provides a real get_params, which GridSearchCV
    needs when cloning the pipeline."""
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return np.asarray(X.todense())
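TfidfVectorizer emits a scipy sparse matrix, which PCA (with its default solver) and GaussianMixture will not accept; DenseTransformer bridges that gap inside a Pipeline. A minimal check:
In [ ]:
sparse = TfidfVectorizer().fit_transform(['aa bb cc', 'bb cc dd'])
dense = DenseTransformer().fit_transform(sparse)
print(type(sparse).__name__, type(dense).__name__)  # csr_matrix ndarray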
In [4]:
def translateNumColors(colorList):
    # Map integer labels to matplotlib single-letter color codes,
    # e.g. translateNumColors([0, 1, 2]) -> ['r', 'g', 'b'].
    # (Both functions now share one ordering; they previously disagreed,
    # which would have colored the same label differently.)
    colorCodes = 'rgbcmykw'
    return [colorCodes[numColor] for numColor in colorList]

def translateNumColor(color):
    colorCodes = 'rgbcmykw'
    return colorCodes[color]
In [5]:
def verboseGridSearch(pipeline, parameters, docs, labels):
    # Score each candidate pipeline by how well its predicted cluster
    # assignments agree with the known authors (adjusted Rand index).
    scorer = metrics.make_scorer(metrics.adjusted_rand_score)
    grid_search = GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=-1, verbose=1)
    print("Performing grid search...")
    t0 = time()
    grid_search.fit(docs, labels)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return grid_search
In [6]:
def plotLabeled(transformed, labels, wordLabels, lengths):
    plt.scatter(transformed[:, 0], transformed[:, 1],
                c=translateNumColors(labels), s=lengths)
    # Build a legend mapping each color back to its character.
    colorLabelAssociations = set(zip(labels, wordLabels, translateNumColors(labels)))
    legends = [mpatches.Patch(color=assoc[2], label=assoc[1])
               for assoc in colorLabelAssociations]
    plt.legend(handles=legends, loc='lower right', fontsize='small')
In [177]:
text = Text(lenMin=8000, lenMax=5000000000, maxChars=6, cutoff=None).fit()
docs = text.transform('clarissa.xml')
labels = text.numericLabels
wordLabels = text.labels
lengths = [length / 500 for length in text.lengths]  # scaled for scatter-point sizes
text.plotHist()
# Hold out the first ten letters as a tiny labeled training set for the MLP below.
text_train, labels_train, wordLabels_train = docs[:10], labels[:10], wordLabels[:10]
In [8]:
len(text.docs), len(text.labels)
Out[8]:
In [ ]:
searchPipeline = Pipeline([
('tfidf', TfidfVectorizer(max_df=0.5,
max_features=500)),
('todense', DenseTransformer()),
('pca', PCA(n_components=5)),
('gmm', GaussianMixture(n_components=8)),
])
params = {
'tfidf__max_features': (500, 1000, None),
# 'tfidf__max_df': (0.5, 1.0),
# 'tfidf__use_idf': (True, False),
'pca': [PCA(), NMF(), TruncatedSVD()],
'pca__n_components': (2, 5, 10, 25, 50),
'gmm': [GaussianMixture(), BayesianGaussianMixture()],
'gmm__n_components': (4, 6, 8),
}
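Note that the grid swaps whole estimators in and out: listing estimator instances under a step's name ('pca', 'gmm') makes GridSearchCV try each one as that step, and a parameter like pca__n_components is applied to whichever estimator currently occupies the slot (all three decompositions, and both mixture models, take n_components). NMF additionally requires non-negative input, which tf-idf features satisfy.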
In [ ]:
searchresults = verboseGridSearch(searchPipeline, params, docs, labels)
In [197]:
transformPipeline = Pipeline([
('tfidf', TfidfVectorizer(max_features=None)),
('todense', DenseTransformer()),
('pca', PCA(n_components=10)),
# ('lda', LatentDirichletAllocation(n_topics=50))
# ('dict', DictionaryLearning(n_components=6))
# ('factor', FactorAnalysis(n_components=10))
])
In [198]:
transformed = transformPipeline.fit_transform(docs)
# Use transform (not fit_transform) here, so the training subset is projected
# into the same tf-idf/PCA space as the full corpus.
train_transformed = transformPipeline.transform(text_train)
In [199]:
transformed.shape, train_transformed.shape
Out[199]:
In [200]:
gmm = GaussianMixture(n_components=4).fit(transformed)
bgm = BayesianGaussianMixture(n_components=4).fit(transformed)  # fit alongside, but unused below
assignments = gmm.predict(transformed)
In [201]:
mlp = MLPClassifier().fit(train_transformed, labels_train)
assignments = mlp.predict(transformed)  # note: overwrites the GMM assignments above
In [202]:
plotLabeled(transformed, labels, wordLabels, lengths)
In [203]:
plt.scatter(transformed[:,0], transformed[:,1],
c=translateNumColors(assignments), s=lengths)
Out[203]:
In [204]:
metrics.adjusted_rand_score(labels, assignments)
Out[204]:
In [205]:
metrics.adjusted_mutual_info_score(labels, assignments)
Out[205]:
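The two scores above evaluate the MLP predictions, since `assignments` was overwritten in In [201]. The same evaluation for the unsupervised GMM assignments, for comparison:
In [ ]:
gmm_assignments = gmm.predict(transformed)
print(metrics.adjusted_rand_score(labels, gmm_assignments))
print(metrics.adjusted_mutual_info_score(labels, gmm_assignments))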
In [ ]: