In [1]:
from lxml import etree
import re
import math
import numpy as np
import pandas as pd
from pprint import pprint
from time import time
# import edward as ed
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import TransformerMixin
from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
%matplotlib inline
In [2]:
class Text():
    def __init__(self, lenRange=(2000, 10000), chunks=False):
        self.lenRange = lenRange
        self.chunks = chunks

    def fit(self, *_):
        print('heyo! fitting')
        return self

    def transform(self, filename):
        print('heyo! transforming')
        lenMin, lenMax = self.lenRange
        self.tree = etree.parse(filename)
        self.allSaidElems = self.tree.findall('.//said[@who]')
        # Only get those in our length range
        self.saidElems = [elem for elem in self.allSaidElems if len(elem.text) > lenMin and len(elem.text) < lenMax]
        self.allChars = [elem.attrib['who'] for elem in self.saidElems]
        self.chars = list(set(self.allChars))
        self.labeledText = [(elem.attrib['who'], elem.text) for elem in self.saidElems]
        self.labeledText = [(item[0], self.clean(item[1])) for item in self.labeledText]
        self.labels = [item[0] for item in self.labeledText]
        charDict = {'Bernard': 0, 'Louis': 1, 'Neville': 2,
                    'Rhoda': 3, 'Jinny': 4, 'Susan': 5}
        self.numericLabels = [charDict[label] for label in self.labels]
        self.allText = [item[1] for item in self.labeledText]
        self.charDict = self.makeCharDict()
        self.charChunks, self.charChunksLabels = self.makeCharChunks()
        if self.chunks:
            self.allText = self.charChunks
            self.labels = self.charChunksLabels
        self.lengths = [len(item) for item in self.allText]
        return self.allText

    def makeCharDict(self):
        """ Make a dictionary of each character's total speech. """
        # Initialize empty dictionary.
        charDict = {char: "" for char in self.chars}
        for elem in self.allSaidElems:
            charDict[elem.attrib['who']] += elem.text
        return charDict

    def makeCharChunks(self, n=2):
        """ Make a list of chunks of character speech. """
        charChunks = []
        charChunksLabels = []
        for char, text in self.charDict.items():
            # chunks = self.chunkText(text, n)
            chunks = self.sliceText(text)
            for i, chunk in enumerate(chunks):
                charChunks.append(chunk)
                charChunksLabels.append(char + '-%s' % i)
        return charChunks, charChunksLabels

    def chunkText(self, text, n=2):
        """ Breaks one text into n texts. """
        newLen = math.floor(len(text) / n)
        parts = [text[i:i+newLen] for i in range(0, len(text), newLen)]
        if len(parts) > n:
            # Fold any leftover slice into the last full part.
            parts[-2] += parts[-1]
            parts = parts[:n]
        return parts

    def sliceText(self, text, size=8000):
        """ Cuts a text into fixed-size slices; any remainder shorter than `size` is dropped. """
        parts = []
        while len(text) > size:
            part = text[:size]
            text = text[size:]
            parts.append(part)
        return parts

    def clean(self, utterance):
        """
        Cleans utterances.
        """
        # Remove "said Bernard," etc.
        charRegex = "said (%s)," % '|'.join(self.chars)
        out = re.sub(charRegex, '', utterance)
        # Remove quotation marks.
        out = re.sub('[“”"]', '', out)
        # Remove line breaks.
        out = re.sub('\n', ' ', out)
        return out
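A quick sanity check of the two splitting helpers on a toy string (no XML file needed). This is just a sketch (the `t_demo` name is only for the demo): chunkText breaks a text into a fixed number of roughly equal parts, while sliceText cuts fixed-size slices and silently drops any remainder shorter than `size`.
In [ ]:
t_demo = Text()
sample = 'abcdefghij' * 5   # 50 characters
print(t_demo.chunkText(sample, n=2))       # two 25-character halves
print(t_demo.sliceText(sample, size=20))   # two 20-character slices; the last 10 characters are dropped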
In [17]:
# Code adapted from http://stackoverflow.com/a/28384887/584121
class DenseTransformer(TransformerMixin):
    def __init__(self, *args, **kwargs):
        return

    def get_params(self, deep=True):
        """ Dummy method; keeps get_params-based cloning (e.g. in GridSearchCV) happy. """
        return {'None': 'None'}

    def transform(self, X, y=None, **fit_params):
        # PCA needs a dense ndarray; np.asarray also covers newer scikit-learn
        # releases that reject np.matrix.
        return np.asarray(X.todense())

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self
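PCA, unlike TruncatedSVD, won't accept the sparse matrix that TfidfVectorizer produces, which is why this dense conversion sits between them. A minimal standalone check on toy documents (the `toy_docs` and `toy_pipe` names are illustrative; DenseTransformer is the class defined above):
In [ ]:
toy_docs = ['the waves broke on the shore',
            'the sun had not yet risen',
            'the birds sang their blank melody']
toy_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('todense', DenseTransformer()),
    ('pca', PCA(n_components=2)),
])
print(toy_pipe.fit_transform(toy_docs).shape)   # (3, 2): one 2-D point per document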
In [18]:
labels = text_clf.named_steps.get('text').labels
numLabels = text_clf.named_steps.get('text').numericLabels
word_names = text_clf.named_steps.get('tfidf').get_feature_names()
In [19]:
lenmin, lenmax = (0, 200000000)
text = 'waves-tei.xml'
text_clf = Pipeline([
    # The Text step turns the filename into a list of cleaned utterances,
    # which is what the rest of the pipeline expects as input.
    ('text', Text(lenRange=(lenmin, lenmax))),
    ('tfidf', TfidfVectorizer(use_idf=False, max_features=100)),
    ('todense', DenseTransformer()),
    # ('SVD', TruncatedSVD()),
    ('PCA', PCA(n_components=10)),
    ('bgm', BayesianGaussianMixture(n_components=6)),
])
tclf = text_clf.fit(text)
predictions = tclf.predict(text)
predictions
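The mixture's component indices are arbitrary, so a cross-tabulation against the known speakers is an easy way to see how well components track characters (a sketch, assuming the fitted pipeline above):
In [ ]:
speakers = tclf.named_steps['text'].labels
pd.crosstab(pd.Series(speakers, name='speaker'),
            pd.Series(predictions, name='component'))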
In [25]:
parameters = {'tfidf__use_idf': (True, False),
              'tfidf__max_features': (100, 200, 500),
              'pca__n_components': (2, 5, 10, 20, 30, 50)}
In [26]:
# Keep the Text instance in `t` so the filename in `text` isn't shadowed;
# the helper functions further down refer to it as `t`.
t = Text().fit()
docs = t.transform('waves-tei.xml')
labels = t.numericLabels
In [27]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('todense', DenseTransformer()),
    ('pca', PCA(n_components=10)),
    ('bgm', BayesianGaussianMixture(n_components=6)),
])
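The grid keys in the parameters cell above use scikit-learn's step__parameter naming, so 'pca__n_components' targets n_components on the step registered as 'pca'; pipeline.get_params() lists every name the grid is allowed to use:
In [ ]:
sorted(p for p in pipeline.get_params() if '__' in p)[:15]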
In [28]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(docs, labels)
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
In [ ]:
# svd = text_clf.named_steps.get('SVD')
# x, y = svd.components_[0,:], svd.components_[1,:]
# PCA's components_ are per-word loadings, not per-document coordinates;
# to place the documents, run them back through the fitted steps up to PCA.
reduced = Pipeline(text_clf.steps[:-1]).transform(text)
x, y = reduced[:, 0], reduced[:, 1]
In [ ]:
metrics.adjusted_rand_score(numLabels, predictions)
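For reference, adjusted Rand index is 1.0 for a perfect match up to relabeling, near 0 for random assignments, and can go negative when clusters systematically disagree:
In [ ]:
print(metrics.adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0: same partition, labels swapped
print(metrics.adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]))  # negative: systematic disagreement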
In [ ]:
def translateNumColor(colorList):
    colorMap = 'rgbcymk'
    return [colorMap[numColor] for numColor in colorList]
In [ ]:
colors = translateNumColor(numLabels)
In [ ]:
plt.scatter(x, y, c=colors)
In [ ]:
tfidf_vectorizer = TfidfVectorizer(min_df=0.15, max_df=0.4)
tfidfs = tfidf_vectorizer.fit_transform(docs).todense()
df = pd.DataFrame(tfidfs, columns=tfidf_vectorizer.get_feature_names())
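Float values for min_df and max_df are document-frequency proportions: a term must appear in at least 15% and at most 40% of the utterances to stay in the vocabulary, which drops both rare words and the very common words every character shares. A toy illustration (names here are only for the demo):
In [ ]:
toy = ['apple banana', 'apple cherry', 'apple banana date', 'apple banana egg']
toy_vec = TfidfVectorizer(min_df=0.5, max_df=0.9)
toy_vec.fit(toy)
print(toy_vec.vocabulary_)   # only 'banana' (3/4 docs) survives both thresholds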
In [ ]:
def addLabels(df):
    df['label'] = t.labels
    df['char'] = df['label'].str.split('-').str.get(0)
    df['lengths'] = t.lengths
    df['lengths'] = df['lengths'].divide(df['lengths'].max())
    df['lengths'] = df['lengths'] * 40
    charDict = {'Bernard': 0, 'Louis': 1, 'Neville': 2,
                'Rhoda': 3, 'Jinny': 4, 'Susan': 5}
    # Map on the character column so chunked labels like 'Bernard-3' also work.
    df['numLabel'] = df['char'].map(charDict)
In [ ]:
# Do latent semantic analysis dimensionality reduction.
svd = TruncatedSVD(n_components=2)
lsa_out = svd.fit_transform(df)
lsaDF = pd.DataFrame(lsa_out, columns=['x','y'])
addLabels(lsaDF)
In [ ]:
# Do PCA
pca = PCA(n_components=2)
pca_out = pca.fit_transform(df)
pcaDF = pd.DataFrame(pca_out, columns=['x', 'y'])
addLabels(pcaDF)
In [ ]:
def translateColor(label):
    colorDict = {'B': 'b', 'S': 'g', 'R': 'r', 'N': 'c', 'J': 'y', 'L': 'm'}
    return colorDict[label[0]]
In [ ]:
def plotChars(df, dims=2):
    if dims == 3:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
    for char in t.chars:
        charDF = df.loc[df['char'] == char]
        if dims == 2:
            plt.scatter(charDF['x'], charDF['y'], color=translateColor(char), s=charDF['lengths'], label=char)
            #plt.legend()
        if dims == 3:
            # The 3D branch expects a 'z' column in the DataFrame.
            ax.scatter(charDF['x'], charDF['y'], charDF['z'], color=translateColor(char),
                       s=charDF['lengths'], label=char)
In [ ]:
def doKMeans(df):
    km = KMeans(n_clusters=6)
    # fit_transform returns each point's distance to every cluster center,
    # so the assigned cluster is the *nearest* one (argmin, not argmax).
    fitted = km.fit_transform(df[['x', 'y']])
    means = np.argmin(fitted, axis=1)
    df['mean'] = means
    colorMap = 'rgbcmyk'
    for row in df.iterrows():
        plt.scatter(row[1]['x'], row[1]['y'], color=colorMap[row[1]['mean']])
    print(metrics.adjusted_rand_score(df['numLabel'], df['mean']))
    return means
In [ ]:
def doInference(df):
    gm = BayesianGaussianMixture(n_components=6,
                                 covariance_type="full")
    dfXY = df[['x', 'y']]
    gm.fit(dfXY)
    colorMap = 'rgbcmyk'
    df['mean'] = gm.predict(dfXY)
    for row in df.iterrows():
        plt.scatter(row[1]['x'], row[1]['y'], color=colorMap[row[1]['mean']])
    print(metrics.adjusted_rand_score(df['numLabel'], df['mean']))
    return df['mean']
In [ ]:
means = doKMeans(pcaDF)
In [ ]:
means = doInference(lsaDF)
In [ ]:
plotChars(pcaDF)
In [ ]:
plotChars(lsaDF)
In [ ]: