In [3]:
from vsm import *
from vsm.extensions.corpusbuilders import toy_corpus


# Toy corpus: ten short passages separated by blank lines. The first five are
# about Catholic theology and the Catholic Church; the last five are about
# mathematical logic and computability, some of them naming Alonzo Church, so
# the word "Church" occurs in both groups.
# NOTE(review): presumably toy_corpus() treats each blank-line-separated
# passage as one document -- confirm against the vsm documentation.
plain_corpus = """
His theology challenged the Pope of the Roman Catholic Church by
teaching that the Bible is the only source of divinely revealed
knowledge.

Augustine is held in the Catholic Church to be the model teacher.

Augustine was recognized as a Doctor of the Church by Pope Boniface
VIII.

Roman Catholic theology stated that faith alone cannot justify man.

In the Catholic Church the Pope is regarded as the successor of Saint
Peter.

Alonzo Church was an American mathematician and logician who made
major contributions to mathematical logic and the foundations of
theoretical computer science.

The lambda calculus was introduced by mathematician Alonzo Church as
an investigation into the foundations of mathematics.

The Church Turing thesis states that a function is algorithmically
computable if and only if it is computable by a Turing machine.

Mathematical logic has close connections to the foundations of
mathematics, theoretical computer science.

A Turing machine can be adapted to simulate the logic of any computer
algorithm.
"""

# One label per passage, in corpus order: 'Ecclesiastical 1'..'Ecclesiastical 5'
# followed by 'Logic 1'..'Logic 5'. `range` is used instead of the
# Python-2-only `xrange` so the cell runs unchanged on Python 2 and Python 3;
# with five items per list there is no memory cost to the switch.
metadata = ['Ecclesiastical ' + str(i) for i in range(1, 6)]

metadata += ['Logic ' + str(i) for i in range(1, 6)]

# Build the corpus from the raw text and attach the labels above.
# NOTE(review): nltk_stop=True presumably filters NLTK's stop-word list -- confirm.
c = toy_corpus(plain_corpus, nltk_stop=True, metadata=metadata)

# Two-topic LDA model over the 'document' context type.
m = LDA(c, context_type='document', K=2)

# NOTE(review): no random seed is set in the visible cells, so the figures
# recorded in the outputs may differ between runs -- confirm whether LDA
# accepts a seed if reproducible output is required.
m.train(n_iterations=200)

# Viewer pairing the corpus with the trained model; the cells after this one
# query it.
v = LdaCgsViewer(c, m)


Begin LDA training for 200 iterations
Iteration 199 complete: log_prob=-292.814182029, time=0.0

In [4]:
# List the highest-probability words of every topic together with their
# probabilities (topics ordered by index).
# NOTE(review): compact_view=False presumably selects the expanded
# word/probability table over a condensed listing -- confirm.
v.topics(compact_view=False)


Out[4]:
Topics Sorted by Index
Topic 0 Topic 1
Word Prob Word Prob
logic 0.06070 catholic 0.09879
turing 0.06070 church 0.09879
foundations 0.06070 pope 0.07416
church 0.06070 theology 0.04952
computer 0.06070 augustine 0.04952
computable 0.04053 roman 0.04952
mathematical 0.04053 model 0.02488
mathematician 0.04053 man 0.02488
mathematics 0.04053 bible 0.02488
science 0.04053 boniface 0.02488

In [5]:
# Topic probabilities for the single document labelled 'Logic 1'.
v.doc_topics({'document_label': 'Logic 1'})


Out[5]:
Document: Logic 1
Topic Prob
0 0.99929
1 0.00071

In [6]:
# Rank topics by their distance from topics 0 and 1 combined.
# NOTE(review): weights presumably sets the relative contribution of each
# listed topic (0.6 for topic 0, 0.4 for topic 1) -- confirm.
v.dist_top_top([0,1], weights=[.6, .4])


Out[6]:
Sorted by Topic Distance
Topic Words
0 logic, turing, foundations, church, computer, computable, mathematical, mathematician, mathematics, science
1 catholic, church, pope, theology, augustine, roman, model, man, bible, boniface

In [7]:
# Rank every document by its distance from the document labelled
# 'Ecclesiastical 3'.
v.dist_doc_doc({'document_label': 'Ecclesiastical 3'})


Out[7]:
Documents:
Document Distance
Ecclesiastical 3 0.00000
Ecclesiastical 5 0.00000
Ecclesiastical 4 0.00207
Ecclesiastical 2 0.00256
Ecclesiastical 1 0.00754
Logic 5 0.99221
Logic 4 0.99297
Logic 2 0.99297
Logic 3 0.99325
Logic 1 0.99398

In [8]:
# Rank every document by its distance from topics 0 and 1 taken together.
v.dist_top_doc([0,1])


Out[8]:
Topics: 0, 1
Document Distance
Ecclesiastical 2 0.55111
Logic 5 0.55195
Ecclesiastical 3 0.55195
Ecclesiastical 5 0.55195
Ecclesiastical 4 0.55259
Logic 2 0.55310
Logic 4 0.55310
Logic 3 0.55351
Ecclesiastical 1 0.55415
Logic 1 0.55462

In [9]:
# Rank topics by their distance from the word 'church', which occurs in both
# halves of the toy corpus.
v.dist_word_top('church')


Out[9]:
Sorted by Topic Distance
Topic Words
1 catholic, church, pope, theology, augustine, roman, model, man, bible, boniface
0 logic, turing, foundations, church, computer, computable, mathematical, mathematician, mathematics, science

In [10]:
# Rank topics by their distance from the word 'logic', which occurs only in
# the logic half of the toy corpus.
v.dist_word_top('logic')


Out[10]:
Sorted by Topic Distance
Topic Words
0 logic, turing, foundations, church, computer, computable, mathematical, mathematician, mathematics, science

In [11]:
# Pairwise distance matrix between topics (2 x 2 here, since the model has
# K=2 topics).
v.dismat_top()


Out[11]:
IndexedSymmArray([[ 0.        ,  0.93196539],
       [ 0.93196539,  0.        ]])

In [12]:
# Row/column labels of the topic distance matrix. This recomputes the matrix
# just to read its `labels` attribute.
v.dismat_top().labels


Out[12]:
['0', '1']

In [13]:
# Pairwise distance matrix between documents, rounded to one decimal place so
# the values are easier to scan.
# NOTE(review): `np` is not imported in the visible cells; presumably it comes
# from `from vsm import *` or from an earlier cell -- confirm.
np.around(v.dismat_doc(), decimals=1)


Out[13]:
array([[ 0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.]])

In [14]:
# Row/column labels of the document distance matrix. This recomputes the
# matrix just to read its `labels` attribute.
v.dismat_doc().labels


Out[14]:
array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], 
      dtype='|S1')

In [15]:
# Render matplotlib figures inline in the notebook output; this runs before
# the plotting cells that follow.
%matplotlib inline

In [16]:
# NOTE(review): presumably plots the log probability reported during training
# against iteration number; the figure is not captured in this export -- confirm.
p1 = v.logp_plot()



In [17]:
# NOTE(review): presumably draws a histogram summarising the topics; the
# figure is not captured in this export -- confirm against the vsm docs.
p2 = v.topic_hist()