In [1]:
from lxml import etree
import re
import math
import numpy as np
import pandas as pd
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans
%matplotlib inline
In [2]:
class Text(BaseEstimator, TransformerMixin):
    def __init__(self, lenMin=2000, lenMax=10000, chunks=False):
        self.lenMin = lenMin
        self.lenMax = lenMax
        self.chunks = chunks

    def fit(self, *_):
        print('heyo! fitting')
        return self

    def transform(self, filename):
        print('heyo! transforming')
        lenMin, lenMax = self.lenMin, self.lenMax
        self.tree = etree.parse(filename)
        self.allSaidElems = self.tree.findall('.//said[@who]')
        # Only keep utterances in our length range.
        self.saidElems = [elem for elem in self.allSaidElems
                          if lenMin < len(elem.text) < lenMax]
        self.allChars = [elem.attrib['who'] for elem in self.saidElems]
        self.chars = list(set(self.allChars))
        self.labeledText = [(elem.attrib['who'], elem.text) for elem in self.saidElems]
        self.labeledText = [(who, self.clean(text)) for who, text in self.labeledText]
        self.labels = [who for who, _ in self.labeledText]
        # Map each speaker to a numeric label for scoring.
        charIndex = {'Bernard': 0, 'Louis': 1, 'Neville': 2,
                     'Rhoda': 3, 'Jinny': 4, 'Susan': 5}
        self.numericLabels = [charIndex[label] for label in self.labels]
        self.allText = [text for _, text in self.labeledText]
        self.charDict = self.makeCharDict()
        self.charChunks, self.charChunksLabels = self.makeCharChunks()
        if self.chunks:
            # Use fixed-size chunks of each character's concatenated
            # speech instead of individual utterances.
            self.allText = self.charChunks
            self.labels = self.charChunksLabels
            self.numericLabels = [charIndex[label.split('-')[0]] for label in self.labels]
        self.lengths = [len(item) for item in self.allText]
        return self.allText

    def makeCharDict(self):
        """ Make a dictionary of each character's total speech. """
        # Initialize empty dictionary.
        charDict = {char: "" for char in self.chars}
        for elem in self.allSaidElems:
            charDict[elem.attrib['who']] += self.clean(elem.text)
        return charDict

    def makeCharChunks(self):
        """ Make a list of chunks of character speech, labeled like 'Bernard-0'. """
        charChunks = []
        charChunksLabels = []
        for char, text in self.charDict.items():
            chunks = self.sliceText(text)
            for i, chunk in enumerate(chunks):
                charChunks.append(chunk)
                charChunksLabels.append('%s-%s' % (char, i))
        return charChunks, charChunksLabels

    def sliceText(self, text, size=8000):
        """ Slice text into equal-size chunks; any remainder shorter
        than `size` is dropped so chunks stay comparable. """
        parts = []
        while len(text) > size:
            part = text[:size]
            text = text[size:]
            parts.append(part)
        return parts

    def clean(self, utterance):
        """ Cleans utterances. """
        # Remove "said Bernard," etc.
        charRegex = "said (%s)" % '|'.join(self.chars)
        out = re.sub(charRegex, '', utterance)
        # Remove quotation marks.
        out = re.sub('[“”"]', '', out)
        # Remove line breaks.
        out = re.sub('\n', ' ', out)
        return out
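To see what clean() does, here's a quick sketch on a made-up utterance (self.chars is normally populated inside transform(), so it's set by hand here):
In [ ]:
t = Text()
t.chars = ['Bernard', 'Louis']  # normally set inside transform()
t.clean('“I see a ring,” said Bernard, “hanging above me.”')
# -> 'I see a ring, , hanging above me.'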
In [3]:
# Code adapted from http://stackoverflow.com/a/28384887/584121
class DenseTransformer(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        # PCA can't consume scipy sparse input, so densify here.
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self
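A minimal alternative sketch, assuming a scikit-learn version whose FunctionTransformer supports accept_sparse; a named function (rather than a lambda) keeps the transformer picklable for GridSearchCV with n_jobs=-1:
In [ ]:
from sklearn.preprocessing import FunctionTransformer

def to_dense(X):
    # Densify a scipy sparse matrix so PCA can consume it.
    return X.toarray()

densify = FunctionTransformer(to_dense, accept_sparse=True)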
In [4]:
text = Text().fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
In [5]:
len(docs), len(labels)
Out[5]:
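As a sanity check on the extraction, a quick look at how many utterances each speaker contributes:
In [ ]:
# Utterance counts per speaker in the filtered selection.
pd.Series(text.labels).value_counts()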
In [6]:
def verboseGridSearch(pipeline, parameters, docs, labels):
    # Score candidate pipelines by how well their cluster assignments
    # agree with the known speaker labels.
    scorer = metrics.make_scorer(metrics.adjusted_rand_score)
    grid_search = GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=-1, verbose=1)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(docs, labels)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return grid_search
In [7]:
pipeline = Pipeline([
# ('text', Text()),
('tfidf', TfidfVectorizer()),
('todense', DenseTransformer()),
('pca', PCA()),
('bgm', BayesianGaussianMixture()),
])
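Before searching, a single hand-picked configuration can be fit and scored directly against the speaker labels; a minimal sketch (these parameter values are placeholders, not tuned choices):
In [ ]:
# Fit one configuration and score its cluster assignments.
pipeline.set_params(tfidf__max_features=1000, pca__n_components=10, bgm__n_components=6)
preds = pipeline.fit(docs).predict(docs)
metrics.adjusted_rand_score(labels, preds)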
In [8]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
'tfidf__min_df': (0.0, 0.3, 0.4),
'tfidf__max_df': (0.5, 0.7, 1.0),
'tfidf__max_features': (100, 200, 500, 1000, 2000),
'pca__n_components': (2, 5, 10, 20, 30, 50),
'bgm__n_components': (2, 4, 6)
}
verboseGridSearch(pipeline, parameters, docs, labels)
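Since verboseGridSearch returns the fitted GridSearchCV, the full per-candidate results can also be inspected as a DataFrame; a sketch:
In [ ]:
# Capture the return value to see every candidate, not just the best one.
gs = verboseGridSearch(pipeline, parameters, docs, labels)
pd.DataFrame(gs.cv_results_).sort_values('mean_test_score', ascending=False).head()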
In [11]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
'tfidf__max_df': (0.2, 0.3, 0.4, 0.5, 0.6),
'tfidf__max_features': (100, 200, 500, 1000, 10000, 100000),
'pca__n_components': (5, 10, 20, 25, 50),
'bgm__n_components': (2, 4, 6, 8)
}
verboseGridSearch(pipeline, parameters, docs, labels)
In [12]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
'tfidf__max_df': (0.2, 0.3, 0.4, 1.0),
'tfidf__max_features': (200, 500, 1000, None),
'pca__n_components': (2, 5, 25, 35),
'bgm__n_components': (2, 4, 6, 8)
}
verboseGridSearch(pipeline, parameters, docs, labels)
In [14]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
'tfidf__min_df': (0.0, 1),
'tfidf__max_df': (0.2, 0.3, 0.4, 1.0),
'tfidf__max_features': (200, 500, 1000, None),
'pca__n_components': (2, 5, 25, 35, 50),
'bgm__n_components': (2, 4, 6, 8, 10)
}
verboseGridSearch(pipeline, parameters, docs, labels)
In [15]:
text = Text(lenMin=0, lenMax=1000000000).fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
In [16]:
len(docs), len(labels)
Out[16]:
In [17]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
# 'tfidf__max_df': (0.5, 0.7, 1.0),
'tfidf__max_features': (100, 500, 1000, None),
'pca__n_components': (2, 5, 25, 30),
'bgm__n_components': (6, 8, 10)
}
verboseGridSearch(pipeline, parameters, docs, labels)
In [18]:
text = Text(lenMin=2000, lenMax=1000000000000, chunks=True).fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
In [19]:
len(docs), len(labels)
Out[19]:
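Each chunk label encodes the speaker plus a running chunk index; a peek at the first few:
In [ ]:
# Chunk labels look like 'Bernard-0', 'Bernard-1', ...
text.charChunksLabels[:5]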
In [21]:
pd.Series([len(doc) for doc in docs]).hist()
Out[21]:
In [22]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
# 'tfidf__max_df': (0.5, 0.7, 1.0),
'tfidf__max_features': (100, 500, 1000, None),
'pca__n_components': (2, 5, 25, 30),
'bgm__n_components': (6, 8, 10)
}
verboseGridSearch(pipeline, parameters, docs, labels)
In [27]:
text = Text(lenMin=3000, lenMax=20000).fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
pd.Series([len(doc) for doc in docs]).hist()
Out[27]:
In [28]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
# 'tfidf__max_df': (0.5, 0.7, 1.0),
'tfidf__max_features': (10000, 20000, 50000, None),
'pca__n_components': (25, 30, 35),
'bgm__n_components': (6, 8, 10)
}
In [29]:
verboseGridSearch(pipeline, parameters, docs, labels)
In [31]:
text = Text(lenMin=0, lenMax=10000).fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
pd.Series([len(doc) for doc in docs]).hist()
Out[31]:
In [32]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
# 'tfidf__max_df': (0.5, 0.7, 1.0),
'tfidf__max_features': (10000, 20000, 50000, None),
'pca__n_components': (25, 30, 35),
'bgm__n_components': (6, 8, 10)
}
In [33]:
verboseGridSearch(pipeline, parameters, docs, labels)
In [34]:
text = Text(lenMin=4000, lenMax=10000000000).fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
pd.Series([len(doc) for doc in docs]).hist()
Out[34]:
In [35]:
len(docs)
Out[35]:
In [36]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
# 'tfidf__max_df': (0.5, 0.7, 1.0),
'tfidf__max_features': (10000, 20000, 50000, None),
'pca__n_components': (25, 30, 35),
'bgm__n_components': (6, 8, 10)
}
In [37]:
verboseGridSearch(pipeline, parameters, docs, labels)
In [39]:
text = Text(lenMin=2000, lenMax=50000).fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
pd.Series([len(doc) for doc in docs]).hist()
Out[39]:
In [40]:
len(docs)
Out[40]:
In [41]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
'tfidf__max_df': (0.3, 0.4, 0.5, 1.0),
'tfidf__max_features': (100, 500, 5000, None),
'pca__n_components': (2, 5, 25, 35),
'bgm__n_components': (2, 6, 8, 10)
}
In [42]:
verboseGridSearch(pipeline, parameters, docs, labels)
In [43]:
text = Text(lenMin=4000, lenMax=20000).fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels
pd.Series([len(doc) for doc in docs]).hist()
Out[43]:
In [44]:
len(docs)
Out[44]:
In [45]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
'tfidf__max_df': (0.3, 0.4, 0.5, 1.0),
'tfidf__max_features': (100, 500, 5000, None),
'pca__n_components': (2, 5, 25, 35),
'bgm__n_components': (2, 6, 8, 10)
}
In [46]:
verboseGridSearch(pipeline, parameters, docs, labels)
In [50]:
parameters = {
# 'text__lenMin': (2000, 3000),
# 'text__lenMax': (20000, 2000000),
# 'tfidf__use_idf': (True, False),
# 'tfidf__min_df': (0.0, 0.3, 0.4),
'tfidf__max_df': (0.1, 0.2, 0.3, 0.4, 1.0),
'tfidf__max_features': (100, 400, 500, 5000, None),
'pca__n_components': (2, 4, 5, 25, 35),
'bgm__n_components': (2, 6, 8, 10)
}
In [51]:
verboseGridSearch(pipeline, parameters, docs, labels)