In [1]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import time
import itertools as it
from scipy.io import loadmat

In [2]:
from microscopes.models import bb as beta_bernoulli
from microscopes.mixture.definition import model_definition
from microscopes.common.rng import rng
from microscopes.common.recarray.dataview import numpy_dataview
from microscopes.common.scalar_functions import log_exponential
from microscopes.mixture.model import initialize, bind
from microscopes.kernels import gibbs, slice

In [3]:
# create random seed
r = rng()

In [68]:
# NIPS 2012 papers, represented in "bag-of-words" format
data = loadmat('nips12raw_str602.mat', variable_names=['counts', 'wl'])
counts = data['counts']
wordlist = [x[0][0] for x in data['wl']]

# note that ptitles defined above cannot be loaded with loadmat() due to 
# a bug in the decoder
with open('ptitles.txt') as fp:
    titles = [x.strip() for x in fp.readlines()]

In [69]:
# put counts into a format we can deal with
counts = counts.todense()
counts = np.array(counts)
counts = counts.T # (documents x wordbag)

In [6]:
# the original data dimensions
N, W = counts.shape
print N, W


1740 13649

In [8]:
# for our simplistic model, we convert counts into
# binary "word occurs/word doesn't occur"
occurances = counts > 0

In [9]:
# before clustering, let's do some simple analysis 
# and look at how similar the document vectors are,
# and let's look at a distribution of word counts

# first, the distribution on pairwise hamming distances 
# between documents 
def hamming(args):
    i, j = args
    return (occurances[i] != occurances[j]).sum()
hamming_dists = list(map(hamming, it.combinations(range(N), 2)))

In [10]:
hamming_bins = np.bincount(hamming_dists)
plt.bar(range(len(hamming_bins)), hamming_bins)
plt.xlabel('hamming dist (W={})'.format(W))
plt.ylabel('pairwise occurances (N={})'.format(N))
plt.xlim(xmin=0, xmax=W)


Out[10]:
(0, 13649)

In [11]:
# now let's look at the word occurance frequency count
freqs = occurances.sum(axis=0)
freqs = np.array(freqs, dtype=np.float)/float(N)
plt.bar(range(len(freqs)), freqs)
plt.xlabel('word id (W={})'.format(W))
plt.ylabel('frequency of documents containing word')


Out[11]:
<matplotlib.text.Text at 0x11a231f50>

In [14]:
# because of the long tail, we'll do some dimensionality
# reduction-- we'll take the top K words which occur 
# with frequency closest to 1/2
Wtop = 100

dists_from_uniform = map(lambda f: np.abs(f-0.5), freqs)
useful_freqs = list(sorted(enumerate(dists_from_uniform), key=lambda x: x[1]))
useful_features = [idx for idx, _ in useful_freqs[:Wtop]]

In [15]:
# now we'll build a dataset from this subset of features
Y = np.array([tuple(y) for y in occurances[:,useful_features]], dtype=[('', np.bool)]*Wtop)

In [16]:
# build a dataview
view = numpy_dataview(Y)

In [46]:
# we will cluster the NIPS documents with a 
# non-parameteric dirichlet process mixture model
defn = model_definition(N, models=[beta_bernoulli]*Wtop)

# CRP prior alpha
crp_alpha = 100.0

# bb prior
bb_alpha, bb_beta = 1.0, 1.0

In [18]:
# start at a random point in the state space
initargs = {
    'cluster_hp'  : {'alpha':crp_alpha},
    'feature_hps' : [{'alpha':bb_alpha,'beta':bb_beta}]*Wtop,
}
latent = initialize(defn, view, r, **initargs)
model = bind(latent, view)

In [25]:
# set up hyper-parameter inference
indiv_prior_fn = log_exponential(1.2)
hparams = { 
    i : { 
            'alpha' : (indiv_prior_fn, 1.5),
            'beta'  : (indiv_prior_fn, 1.5),
        } 
    for i in xrange(Wtop) 
}

In [47]:
# toss the first burnin samples
burnin = 100
start = time.time()
for _ in xrange(burnin):
    gibbs.assign(model, r)
    slice.hp(model, r, hparams=hparams)
print 'took:', (time.time() - start), 'seconds'


took: 39.1389708519 seconds

In [48]:
# now the group size distribution
group_sizes = [latent.groupsize(gid) for gid in latent.groups()]
group_sizes = sorted(group_sizes, reverse=True)
plt.bar(range(len(group_sizes)), group_sizes)
plt.ylabel('group size')


Out[48]:
<matplotlib.text.Text at 0x12cb61810>

In [ ]:


In [63]:
# top K most popular cluster ids
K = 10
assignments = np.array(latent.assignments(), dtype=np.int)
kv = list(enumerate(np.bincount(assignments)))
kv = sorted(kv, key=lambda x: x[1], reverse=True)
kv = map(lambda x: x[0], kv)
topK = kv[:K]

In [79]:
def features_in_order(cluster):
    idxs = np.where(assignments==cluster)[0]
    freqs = (occurances[:,useful_features])[idxs].sum(axis=0)/float(len(idxs))
    return [x for x in sorted(list(zip(useful_features, freqs)), key=lambda x: x[1], reverse=True)]

def features_in_order_tfidf(cluster):
    idxs = np.where(assignments==cluster)[0]
    tfs = (occurances[:,useful_features])[idxs].sum(axis=0)/float(len(idxs))
    idfs = np.log(float(N) / occurances[:,useful_features].sum(axis=0))
    #idfs = np.log(float(len(idxs)) / ((occurances[:,useful_features])[idxs].sum(axis=0)+1e-5))
    tfidfs = tfs * idfs
    return [x for x in sorted(list(zip(useful_features, tfidfs)), key=lambda x: x[1], reverse=True)]

def cluster_titles(cluster):
    idxs = np.where(assignments==cluster)[0]
    return [titles[i] for i in idxs]

for cid in topK:
    top10words = [wordlist[wid] for wid, _ in features_in_order(cid)[:10]]
    top10words_tfidf = [wordlist[wid] for wid, _ in features_in_order_tfidf(cid)[:10]]
    ts = cluster_titles(cid)
    print "cluster", cid
    print "top 10 words:", top10words
    print "top 10 words (TF-IDF):", top10words_tfidf
    print "titles:"
    for t in ts[:20]:
        print "\t{}".format(t)
    print


cluster 14
top 10 words: [u'hidden', u'weights', u'units', u'layer', u'trained', u'net', u'weight', u'unit', u'task', u'architecture']
top 10 words (TF-IDF): [u'hidden', u'trained', u'layer', u'net', u'units', u'architecture', u'unit', u'weight', u'task', u'weights']
titles:
	Stochastic Learning Networks and their Electronic Implementation
	Centric Models of the Orientation Map in Primary Visual Cortex
	Analysis and Comparison of Different Learning Algorithms for Pattern Association Problems
	Learning Representations by Recirculation
	How Neural Nets Work
	Learning a Color Algorithm from Examples
	Static and Dynamic Error Propagation Networks with Application to Speech Coding
	Teaching Artificial Neural Systems to Drive: Manual Training Techniques for Autonomous Systems
	A 'Neural' Network that Learns to Play Backgammon
	Learning in Networks of Nondeterministic Adaptive Logic Elements
	Constraints on Adaptive Networks for Modeling Human Generalization
	An Optimality Principle for Unsupervised Learning
	Associative Learning via Inhibitory Search
	Efficient Parallel Learning Algorithms for Neural Networks
	Learning by Choice of Internal Representations
	Connectionist Learning of Expert Preferences by Comparison Training
	Skeletonization: A Technique for Trimming the Fat from a Network via Relevance Assessment
	The Boltzmann Perceptron Network: A Multi-Layered Feed-Forward Network Equivalent to the Boltzmann Machine
	Adaptive Neural Net Preprocessing for Signal Detection in Non-Gaussian Noise
	GEMINI: Gradient Estimation Through Matrix Inversion After Noise Injection

cluster 29
top 10 words: [u'constant', u'properties', u'high', u'institute', u'low', u'range', u'inputs', u'current', u'signal', u'level']
top 10 words (TF-IDF): [u'institute', u'properties', u'signal', u'range', u'effect', u'low', u'constant', u'level', u'higher', u'difference']
titles:
	Simulations Suggest Information Processing Roles for the Diverse Currents in Hippocampal Neurons
	Presynaptic Neural Information Processing
	Phase Transitions in Neural Networks
	New Hardware for Massive Neural Networks
	High Density Associative Memories
	Correlational Strength and Computational Algebra of Synaptic Connections Between Neurons
	Schema for Motor Control Utilizing a Network Model of the Cerebellum
	An Optimization Network for Matrix Inversion
	Distributed Neural Information Processing in the Vestibulo-Ocular System
	Spontaneous and Information-Triggered Segments of Series of Human Brain Electric Field Maps
	Programmable Synaptic Chip for Electronic Neural Networks
	A Neural-Network Solution to the Concentrator Assignment Problem
	Neuromorphic Networks Based on Sparse Optical Orthogonal Codes
	Synchronization in Neural Nets
	Neural Approach for TV Image Compression Using a Hopfield Type Network
	Neural Analog Diffusion-Enhancement Layer and Spatio-Temporal Grouping in Early Vision
	Neuronal Maps for Sensory-Motor Control in the Barn Owl
	Modeling Small Oscillating Biological Networks in Analog VLSI
	Storing Covariance by the Associative Long-Term Potentiation and Depression of Synaptic Strengths in the Hippocampus
	Modeling the Olfactory BulbsCoupled Nonlinear Oscillators

cluster 100
top 10 words: [u'units', u'pattern', u'weights', u'representation', u'trained', u'architecture', u'unit', u'presented', u'inputs', u'task']
top 10 words (TF-IDF): [u'representation', u'units', u'trained', u'architecture', u'discussion', u'unit', u'provide', u'task', u'pattern', u'layer']
titles:
	A Computer Simulation of Olfactory Cortex with Functional Implications for Storage and Retrieval of Olfactory Information
	Partitioning of Sensory Data by a Cortical Network
	How the Catfish Tracks Its Prey: An Interactive 'Pipelined' Processing System May Direct Foraging via Reticulospinal Neurons
	Discovering Structure from Motion in Monkey, Man and Machine
	A Computer Simulation of Cerebral Neocortex: Computational Capabilities of Nonlinear Neural Networks
	A Dynamical Approach to Temporal Pattern Processing
	Mapping Classifier Systems Into Neural Networks
	Use of Multi-Layered Networks for Coding Speech with Phonetic Features
	Neural Control of Sensory Acquisition: The Vestibulo-Ocular Reflex
	Learning the Solution to the Aperture Problem for Pattern Motion with a Hebb Rule
	Dynamic, Non-Local Role Bindings and Inferencing in a Localist Network for Natural Language Understanding
	An Adaptive Network That Learns Sequences of Transitions
	Digital Realisation of Self-Organising Maps
	Associative Memory in a Simple Model of Oscillating Cortex ........
	Learning Aspect Graph Representations from View Sequences
	A Self-Organizing Multiple-View Representation of 3D Objects
	Neuronal Group Selection Theory: A Grounding in Robotics
	Operational Fault Tolerance of CMAC Networks
	Incremental Parsing by Modular Recurrent Connectionist Networks
	Rule Representations in a Connectionist Chunker

cluster 53
top 10 words: [u'trained', u'test', u'high', u'layer', u'hidden', u'architecture', u'task', u'size', u'low', u'sets']
top 10 words (TF-IDF): [u'trained', u'layer', u'hidden', u'architecture', u'test', u'sets', u'low', u'task', u'net', u'level']
titles:
	Speech Recognition Experiments with Perceptrons
	High Order Neural Networks for Efficient Associative Memory Design
	A Novel Net that Learns Sequential Decision Process
	Applications of Error Back-Propagation to Phonetic Classification
	Consonant Recognition by Modular Construction of Large Phonemic Time-Delay Neural Networks
	Temporal Representations in a Connectionist Speech System
	ALVINN: An Autonomous Land Vehicle in a Neural Network
	Neural Network Star Pattern Recognition for Spacecraft Attitude Determination and Control
	Learning Sequential Structure in Simple Recurrent Networks
	Practical Characteristics of Neural Network and Conventional Pattern Classifiers on Artificial and Speech Problems
	HMM Speech Recognition with Neural Net Discrimination
	A Neural Network for Real-Time Signal Processing
	TRAFFIC: Recognizing Objects Using Hierarchical Reference Frame Transformations
	Recognizing Hand-Printed Letters and Digits
	A Large-Scale Neural Network Which Recognizes Handwritten Kanji Characters
	A Neural Network to Detect Homologies in Proteins
	Comparing the Performance of Connectionist and Statistical Classifiers on an Image Segmentation Problem
	The Tempo 2 Algorithm: Adjusting Time-Delays By Supervised Learning .
	A Recurrent Neural Network for Word Identification from Continuous Phoneme Strings .
	Spoken Letter Recognition .

cluster 56
top 10 words: [u'weights', u'inputs', u'weight', u'layer', u'sum', u'units', u'unit', u'signal', u'pattern', u'architecture']
top 10 words (TF-IDF): [u'sum', u'weight', u'layer', u'inputs', u'signal', u'weights', u'architecture', u'unit', u'units', u'hidden']
titles:
	Mathematical Analysis of Learning Behavior of Neuronal Models
	Network Generality, Training Required, and Precision Required
	Temporal Patterns of Activity in Neural Networks
	Time-Sequential Self-Organization of Hierarchical Neural Networks
	Strategies for Teaching Layered Networks Classification Tasks
	Speech Production Using A Neural Network with a Cooperative Learning Mechanism
	Backpropagation and Its Application to Handwritten Signature Verification
	Using Backpropagation with Temporal Windows to Learn the Dynamics of the CMU Direct-Drive Arm II
	Training a 3-Node Neural Network is NP-Complete
	Spreading Activation over Distributed Microfeatures
	A Low-Power CMOS Circuit Which Emulates Temporal Electrical Properties of Neurons
	A Programmable Analog Neural Computer and Simulator
	Training a Limited-Interconnect, Synthetic Neural IC
	Acoustic-Imaging Computations by Echolocating Bats: Unification of Diversely-Represented Stimulus Features into Whole Images
	Neural Network Analysis of Distributed Representations of Dynamical Sensory- Motor Transformations in the Leech
	Connectionist Architectures for Multi-Speaker Phoneme Recognition
	The Effects of Circuit Integration on a Feature Map Vector Quantizer
	Using A Translation-Invariant Neural Network to Diagnose Heart Arrhythmia.
	A Self-organizing Associative Memory System for Control Applications
	A Method for the Associative Storage of Analog Vectors

cluster 75
top 10 words: [u'average', u'state', u'constant', u'probability', u'effect', u'term', u'rate', u'range', u'noise', u'parameter']
top 10 words (TF-IDF): [u'effect', u'term', u'noise', u'assume', u'range', u'average', u'equation', u'state', u'discussion', u'constant']
titles:
	Minkowski-r Back-Propagation: Learning in Connectionist Models with Non-Euclidian Error Signals
	A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Artificial Neural Networks
	Optimization by Mean Field Annealing
	Models of Ocular Dominance Column Formation: Analytical and Computational Results
	A Model of Neural Oscillator for a Unified Submodule
	A Self-Learning Neural Network
	Neural Network Simulation of Somatosensory Representational Plasticity
	Non-Boltzmann Dynamics in Networks of Spiking Neurons
	Effects of Firing Synchrony on Signal Propagation in Layered Networks
	Analysis of Linsker's Simulations of Hebbian Rules
	Self-organization of Hebbian Synapses in Hippocampal Neurons .
	Associative Memory in a Network of 'Biological' Neurons .
	Phase-coupling in Two-Dimensional Networks of Interacting Oscillators . . .
	Oscillation Onset in Neural Delayed Feedback .
	Optimal Sampling of Natural Images .
	An Attractor Neural Network Model of Recall and Recognition .
	Stationarity of Synaptic Coupling Strength Between Neurons with Nonstationary Discharge Properties
	Single Neuron Model: Response to Weak Modulation in the Presence of Noise
	Burst Synchronization without Frequency Locking in a Completely Solvable Network Model
	A Neural Network for Motion Detection of Drift-Balanced Stimuli

cluster 8
top 10 words: [u'probability', u'distribution', u'generated', u'vector', u'methods', u'high', u'find', u'algorithms', u'applied', u'method']
top 10 words (TF-IDF): [u'generated', u'probability', u'distribution', u'algorithms', u'solution', u'noise', u'points', u'dimensional', u'find', u'step']
titles:
	Navigating Through Temporal Difference .
	Statistical Reliability of a Blowfly Movement-Sensitive Neuron
	Obstacle Avoidance through Reinforcement Learning
	Planar Hidden Markov Modeling: from Speech to Optical Character Recognition
	Clustering with a Domain-Specific Distance Measure
	Central and Pairwise Data Clustering by Competitive Neural Networks
	Efficient Computation of Complex Distance Metrics Using Hierarchical Filtering
	The Power of Amnesia
	Bayesian Modeling and Classification of Neural Signals
	Decoding Cursive Scripts
	MULTIDIMENSIONAL SCALING AND DATA CLUSTERING
	DETERMINISTIC ANNEALING VARIANT OF THE EM ALGORITHM
	ACIIVE LEARNING FOR FUNCTION APPROXIMATION
	Family Discovery
	Clustering Data through an Analogy to the Potts Model
	Symplectic Nonlinear Component Analysis
	Modeling Saccadic Targeting in Visual Search
	Extraction of Temporal Features in the Electrosensory System of Weakly Electric Fish,
	A Variational Principle for Model-based Morphing,
	MIMIC: Finding Optima by Estimating Probability Densities,

cluster 18
top 10 words: [u'test', u'method', u'size', u'sets', u'experiments', u'methods', u'vector', u'obtained', u'algorithms', u'computer']
top 10 words (TF-IDF): [u'sets', u'test', u'experiments', u'size', u'good', u'algorithms', u'examples', u'method', u'provide', u'methods']
titles:
	An Artificial Neural Network for Spatio-Temporal Bipolar Patterns: Application to Phoneme Classification
	Optimal Neural Spike Classification
	A Continuous Speech Recognition System Embedding MLP into HMM
	Bayesian Inference of Regular Grammar and Markov Source Models
	Using Genetic Algorithms to Improve Pattern Classification Performance .
	Multimodular Architecture for Remote Sensing Options
	Hidden Markov Model Induction by Bayesian Model Merging
	Efficient Pattern Recognition Using a New Transformation Distance
	A Boundary Hunting Radial Basis Function Classifier which Allocates Centers Constructively
	A Note on Learning Vector Quantization
	Information, Prediction, and Query by Committee
	Locally Adaptive Nearest Neighbor Algorithms
	Cross-Validation Estimates IMSE
	The Statistical Mechanics of k-Satisfaction
	Learning Complex Boolean Functions: Algorithms and Applications
	CLASSIFYING WITH GAUSSIAN MIXTURES AND CLUSTERS
	NEW ALGORITHMS FOR 2D AND 3D POINT MATCHING: POSE ESTIMATION AND CORRESPONDENCE
	Estimating the Bayes Risk from Sample Data
	Geometry of Early Stopping in Linear Networks
	Boosting Decision Trees

cluster 94
top 10 words: [u'functions', u'theory', u'defined', u'weights', u'constant', u'fixed', u'called', u'assume', u'section', u'real']
top 10 words (TF-IDF): [u'called', u'assume', u'net', u'theory', u'fact', u'functions', u'constant', u'layer', u'defined', u'computational']
titles:
	Supervised Learning of Probability Distributions by Neural Networks
	The Capacity of the Kanerva Associative Memory is Exponential
	Linear Learning: Landscapes and Algorithms
	What Size Net Gives Valid Generalization?
	The Effect of Catecholamines on Performance: From Unit to System Behavior
	The Perceptron Algorithm Is Fast for Non-Malicious Distributions
	Analog Neural Networks of Limited Precision I: Computing with Multilinear Threshold Functions
	Computing with Arrays of Bell-Shaped and Sigrnoid Functions .
	Discrete Affine Wavelet Transforms .
	Extensions of a Theory of Networks for Approximation and Learning .
	Efficient Design of Boltzmann Machines .
	The Devil and the Network .
	Remarks on Interpolation and Recognition Using Neural Nets .
	e-Entropy and the Complexity of Feedforward Neural Networks .
	On The Circuit Complexity of Neural Networks .
	Estimating Average-Case Learning Curves Using Bayesian, Statistical Physics and VC Dimension Methods
	Threshold Network Learning in the Presence of Equivalences
	Neural Computing with Small Weights
	Splines, Rational Functions and Neural Networks
	Computing with Almost Optimal Size Neural Networks

cluster 60
top 10 words: [u'state', u'step', u'current', u'algorithms', u'problems', u'optimal', u'defined', u'probability', u'methods', u'random']
top 10 words (TF-IDF): [u'optimal', u'algorithms', u'mit', u'state', u'step', u'solution', u'current', u'equation', u'called', u'assume']
titles:
	Sequential Decision Problems and Neural Networks
	Learning Time-varying Concepts .
	Integrated Modeling and Control Based on Reinforcement Learning .
	Simulation of Optimal Movements Using the Minimum-Muscle-Tension-Change Model
	Incrementally Learning Time-varying Half-planes
	Memory-based Reinforcement Learning: Efficient Computation with Prioritized Sweeping
	When Will a Genetic Algorithm Outperform Hill Climbing?
	Monte Carlo Matrix Inversion and Reinforcement Learning
	Convergence of Indirect Adaptive Asynchronous Value Iteration Algorithms
	Convergence of Stochastic Iterative Dynamic Programming Algorithms
	REINFORCEMENT LEARNING ALGORITHM FOR PARTIALLY OBSERVABLE MARKOV DECISION PROBLEMS
	ADVANTAGE UPDATING APPLIED TO A DIFFERENTIAL GAME
	REINFORCEMENT LEARNING W1TH SOFT STATE AGGREGATION
	INSTANCE-BASED STATE IDENTIFICATION FOR REINFORCEMENT LEARNING
	AN ACTOR/CRITIC ALGORITHM THAT IS EQUIVALENT TO Q-LEARNING
	INTERIOR POINT IMPLEMENTATIONS OF ALTERNATING MINIMIZATION TRAINING
	Predictive Q-Routing: A Memory-based Reinforcement Learning Approach to Adaptive Traffic Control
	Parallel Optimization of Motion Controllers via Policy Iteration
	Stable Linear Approximations to Dynamic Programming for Stochastic Control Problems with Local Transitions
	Stable Fitted Reinforcement Learning


In [ ]: