In [1]:
from lxml import etree
import re
import math
import numpy as np
import pandas as pd
from pprint import pprint
from time import time
# import edward as ed
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import TransformerMixin
from sklearn.cluster import DBSCAN
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
%matplotlib inline

In [2]:
class Text:
    """ Loads a TEI file and extracts labeled character speech. """
    def __init__(self, lenRange=(2000, 10000), chunks=False):
        self.lenRange = lenRange
        self.chunks = chunks
        
    def fit(self, *_):
        print('heyo! fitting')
        return self

    def transform(self, filename): 
        print('heyo! transforming')
        lenMin, lenMax = self.lenRange
        self.tree = etree.parse(filename)
        self.allSaidElems = self.tree.findall('.//said[@who]')
        # Only keep utterances in our length range (skipping empty elements).
        self.saidElems = [elem for elem in self.allSaidElems
                          if elem.text and lenMin < len(elem.text) < lenMax]
        self.allChars = [elem.attrib['who'] for elem in self.saidElems]
        self.chars = list(set(self.allChars))
        self.labeledText = [(elem.attrib['who'], elem.text) for elem in self.saidElems]
        self.labeledText = [(item[0], self.clean(item[1])) for item in self.labeledText]
        self.labels = [item[0] for item in self.labeledText]
        # NB: distinct from self.charDict below, which maps characters to their speech.
        charNums = {'Bernard': 0, 'Louis': 1, 'Neville': 2,
                    'Rhoda': 3, 'Jinny': 4, 'Susan': 5}
        self.numericLabels = [charNums[label] for label in self.labels]
        self.allText = [item[1] for item in self.labeledText]
        self.charDict = self.makeCharDict()
        self.charChunks, self.charChunksLabels = self.makeCharChunks()
        if self.chunks: 
            self.allText = self.charChunks
            self.labels = self.charChunksLabels
        self.lengths = [len(item) for item in self.allText]
        return self.allText
        
    def makeCharDict(self): 
        """ Make a dictionary of each character's total speech. """
        # Initialize from *all* utterances, so that speakers whose every
        # utterance falls outside the length range can't raise a KeyError.
        charDict = {elem.attrib['who']: "" for elem in self.allSaidElems}
        for elem in self.allSaidElems: 
            charDict[elem.attrib['who']] += elem.text or ""
        return charDict
            
    def makeCharChunks(self, n=2): 
        """ Make a list of chunks of character speech. """
        charChunks = []
        charChunksLabels = []
        for char, text in self.charDict.items(): 
            #chunks = self.chunkText(text, n)
            chunks = self.sliceText(text)
            for i, chunk in enumerate(chunks): 
                charChunks.append(chunk)
                charChunksLabels.append(char + '-%s' % i)
        return charChunks, charChunksLabels
        
    def chunkText(self, text, n=2): 
        """ Breaks one text into n texts."""
        newLen = math.floor(len(text) / n)
        parts = [text[i:i+newLen] for i in range(0, len(text), newLen)]
        if len(parts) > n: 
            parts[-2]+=parts[-1]
            parts = parts[:n]
        return parts
    
    def sliceText(self, text, size=8000):
        """ Slice text into consecutive `size`-character chunks.
        NB: a trailing remainder shorter than `size` is discarded. """
        parts = []
        while len(text) > size: 
            parts.append(text[:size])
            text = text[size:]
        return parts

    def clean(self, utterance): 
        """ 
        Cleans utterances. 
        """
        # Remove "said Bernard," etc. 
        charRegex = "said (%s)," % '|'.join(self.chars)
        out = re.sub(charRegex, '', utterance)
       
        # Remove quotation marks. 
        out = re.sub('[“”"]', '', out)
        
        # Remove line breaks. 
        out = re.sub('\n', ' ', out)
        return out
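
A quick sanity check of clean() (the sample utterance below is invented, and chars is set by hand, since transform() normally populates it):

In [ ]:
checker = Text()
checker.chars = ['Bernard', 'Louis']  # normally set inside transform()
checker.clean('“I see a ring,” said Bernard, “hanging above me.\nIt quivers.”')
# expected: 'I see a ring,  hanging above me. It quivers.'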

In [17]:
# Code adapted from http://stackoverflow.com/a/28384887/584121
class DenseTransformer(TransformerMixin):
    """ Converts a sparse matrix to a dense one, for use mid-Pipeline. """

    def __init__(self, *args, **kwargs): 
        pass

    def get_params(self, deep=True): 
        """ Dummy method, so that GridSearchCV can clone this step. """
        return {}
    
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self
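
The dense step is needed because TfidfVectorizer emits a scipy sparse matrix, while PCA (unlike TruncatedSVD) requires dense input. A minimal check:

In [ ]:
from scipy.sparse import csr_matrix
DenseTransformer().fit_transform(csr_matrix([[1.0, 0.0], [0.0, 2.0]]))
# -> matrix([[ 1.,  0.],
#            [ 0.,  2.]])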

In [18]:
# NB: this fails because the 'text' step is commented out of the
# pipeline in the next cell, so named_steps.get('text') returns None.
labels = text_clf.named_steps.get('text').labels
numLabels = text_clf.named_steps.get('text').numericLabels
word_names = text_clf.named_steps.get('tfidf').get_feature_names()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-b9c5404e07d4> in <module>()
----> 1 labels = text_clf.named_steps.get('text').labels
      2 numLabels = text_clf.named_steps.get('text').numericLabels
      3 word_names = text_clf.named_steps.get('tfidf').get_feature_names()

AttributeError: 'NoneType' object has no attribute 'labels'

In [19]:
lenmin, lenmax = (0, 200000000)
text = 'waves-tei.xml'
text_clf = Pipeline([
#                         ('text', Text(lenRange=(lenmin, lenmax))),
                         ('tfidf', TfidfVectorizer(use_idf=False, max_features=100)),
                         ('todense', DenseTransformer()),
#                          ('SVD', TruncatedSVD()),
                         ('PCA', PCA(n_components=10)),
                         ('bgm', BayesianGaussianMixture(n_components=6)),
                        ])
tclf = text_clf.fit(text)
predictions = tclf.predict(text)
predictions


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-19-9f88062f5c53> in <module>()
      9                          ('bgm', BayesianGaussianMixture(n_components=6)),
     10                         ])
---> 11 tclf = text_clf.fit(text)
     12 predictions = tclf.predict(text)
     13 predictions

/usr/lib/python3.5/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)

/usr/lib/python3.5/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/usr/lib/python3.5/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1350             Tf-idf-weighted document-term matrix.
   1351         """
-> 1352         X = super(TfidfVectorizer, self).fit_transform(raw_documents)
   1353         self._tfidf.fit(X)
   1354         # X is already a transformed view of raw_documents so

/usr/lib/python3.5/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
    828         if isinstance(raw_documents, six.string_types):
    829             raise ValueError(
--> 830                 "Iterable over raw text documents expected, "
    831                 "string object received.")
    832 

ValueError: Iterable over raw text documents expected, string object received.
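
Both tracebacks have the same root cause: with the Text step commented out, the pipeline never turns the filename into a list of documents. TfidfVectorizer therefore receives a bare string (the ValueError here), and named_steps.get('text') returns None (the AttributeError above). The cells below sidestep this by fitting Text outside the pipeline and passing its output in.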

In [25]:
parameters = { 'tfidf__use_idf': (True, False), 
               'tfidf__max_features': (100, 200, 500), 
               'pca__n_components': (2, 5, 10, 20, 30, 50)}
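
That grid is 2 × 3 × 6 = 36 candidates; with the default 3-fold cross-validation, that means 108 fits, matching the log output below.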

In [26]:
text = Text().fit()
docs = text.transform('waves-tei.xml')
labels = text.numericLabels


heyo! fitting
heyo! transforming

In [27]:
pipeline = Pipeline([
                         ('tfidf', TfidfVectorizer()),
                         ('todense', DenseTransformer()),
                         ('pca', PCA(n_components=10)),
                         ('bgm', BayesianGaussianMixture(n_components=6)),
                        ])

In [28]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(docs, labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['tfidf', 'todense', 'pca', 'bgm']
parameters:
{'pca__n_components': (2, 5, 10, 20, 30, 50),
 'tfidf__max_features': (100, 200, 500),
 'tfidf__use_idf': (True, False)}
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.2s
done in 8.285s

Best score: 14.794
Best parameters set:
	pca__n_components: 20
	tfidf__max_features: 500
	tfidf__use_idf: True
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:    8.0s finished
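
Note that no scoring argument was given, so GridSearchCV falls back on the final estimator's own score method, which for BayesianGaussianMixture is (if I read the docs right) the per-sample average log-likelihood bound. The "best score" of 14.794 is thus a likelihood figure, not a label-agreement measure, and the labels passed to fit() play no part in the selection.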

In [ ]:
# NB: these lines assume the earlier text_clf pipeline fit successfully.
# svd = text_clf.named_steps.get('SVD')
# x, y = svd.components_[0,:], svd.components_[1,:]
pca = text_clf.named_steps.get('PCA')
x, y = pca.components_[0], pca.components_[1]

In [ ]:
metrics.adjusted_rand_score(numLabels, predictions)

In [ ]:
def translateNumColor(colorList): 
    colorMap = 'rgbcymk'  # a string of matplotlib color codes, indexed by label
    return [colorMap[numColor] for numColor in colorList]

In [ ]:
colors = translateNumColor(numLabels)

In [ ]:
plt.scatter(x, y, c=colors)

In [ ]:
# Keep only mid-frequency words: those appearing in at least 15%
# and at most 40% of the utterances.
tfidf_vectorizer = TfidfVectorizer(min_df=0.15, max_df=0.4)
tfidfs = tfidf_vectorizer.fit_transform(docs).todense()
df = pd.DataFrame(tfidfs, columns=tfidf_vectorizer.get_feature_names())

In [ ]:
def addLabels(df): 
    """ Attach character labels and scaled utterance lengths to a reduced DataFrame. """
    df['label'] = text.labels
    df['char'] = df['label'].str.split('-').str.get(0)
    df['lengths'] = text.lengths
    df['lengths'] = df['lengths'].divide(df['lengths'].max())
    df['lengths'] = df['lengths'] * 40  # scale up for use as marker sizes
    charDict = {'Bernard': 0, 'Louis': 1, 'Neville': 2,
                'Rhoda': 3, 'Jinny': 4, 'Susan': 5}
    # Map on 'char' rather than 'label', so chunked labels like 'Bernard-0' work too.
    df['numLabel'] = df['char'].apply(lambda x: charDict[x])

In [ ]:
# Do latent semantic analysis dimensionality reduction. 
svd = TruncatedSVD(n_components=2)
lsa_out = svd.fit_transform(df)
lsaDF = pd.DataFrame(lsa_out, columns=['x','y'])
addLabels(lsaDF)

In [ ]:
# Do PCA
pca = PCA(n_components=2)
pca_out = pca.fit_transform(df)
pcaDF = pd.DataFrame(pca_out, columns=['x', 'y'])
addLabels(pcaDF)
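
The two reductions differ mainly in centering: PCA subtracts the column means first, while TruncatedSVD factors the matrix as-is (which is also why TruncatedSVD can work on sparse tf-idf matrices directly). On a small dense frame like this, the projections should be broadly similar.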

In [ ]:
def translateColor(label): 
    colorDict = {'B': 'b', 'S': 'g', 'R': 'r', 'N': 'c', 'J': 'y', 'L': 'm'}
    return colorDict[label[0]]

In [ ]:
def plotChars(df, dims=2): 
    """ Scatter-plot utterances colored by character.
    With dims=3, the DataFrame is expected to have a 'z' column. """
    if dims == 3: 
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
    for char in text.chars: 
        charDF = df.loc[df['char'] == char]
        if dims == 2: 
            plt.scatter(charDF['x'], charDF['y'], color=translateColor(char),
                        s=charDF['lengths'], label=char)
            #plt.legend()
        if dims == 3:         
            ax.scatter(charDF['x'], charDF['y'], charDF['z'], color=translateColor(char), 
                       s=charDF['lengths'], label=char)

In [ ]:
def doKMeans(df):
    km = KMeans(n_clusters=6)
    # fit_transform returns each point's distance to every cluster center,
    # so the assigned cluster is the *closest* one (argmin, not argmax).
    fitted = km.fit_transform(df[['x', 'y']])
    means = np.argmin(fitted, axis=1)
    df['mean'] = means
    colorMap = 'rgbcmyk'
    for row in df.iterrows(): 
        plt.scatter(row[1]['x'], row[1]['y'], color=colorMap[row[1]['mean']])
    print(metrics.adjusted_rand_score(df['numLabel'], df['mean']))
    return means
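
Equivalently, km.fit_predict(df[['x', 'y']]) returns the nearest-cluster labels directly, skipping the distance matrix.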

In [ ]:
def doInference(df): 
    gm = BayesianGaussianMixture(n_components=6,
                                 covariance_type="full")
    dfXY = df[['x', 'y']]
    fitted = gm.fit(dfXY)
    colorMap = 'rgbcmyk'
    df['mean'] = gm.predict(dfXY)
    for row in df.iterrows():
        plt.scatter(row[1]['x'], row[1]['y'], color=colorMap[row[1]['mean']])
    print(metrics.adjusted_rand_score(df['numLabel'], df['mean']))
    return df['mean']
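
Since adjusted_rand_score is invariant to permutations of cluster IDs, there is no need to match cluster numbers to character numbers before comparing.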

In [ ]:
means = doKMeans(pcaDF)

In [ ]:
means = doInference(lsaDF)

In [ ]:
plotChars(pcaDF)

In [ ]:
plotChars(lsaDF)
