In [1]:
import numpy as np

from vsm import *
from vsm.extensions.corpusbuilders import toy_corpus
from vsm.spatial import scipy_cdist


# Toy corpus: ten short passages separated by blank lines. The first five are
# about the Catholic Church (theology, Augustine, the Pope); the last five are
# about Alonzo Church, Turing machines and mathematical logic. The word
# "Church" appears in both groups.
# NOTE(review): presumably toy_corpus treats each blank-line-separated passage
# as one document -- confirm against the vsm corpus builder.
plain_corpus = """
His theology challenged the Pope of the Roman Catholic Church by
teaching that the Bible is the only source of divinely revealed
knowledge.

Augustine is held in the Catholic Church to be the model teacher.

Augustine was recognized as a Doctor of the Church by Pope Boniface
VIII.

Roman Catholic theology stated that faith alone cannot justify man.

In the Catholic Church the Pope is regarded as the successor of Saint
Peter.

Alonzo Church was an American mathematician and logician who made
major contributions to mathematical logic and the foundations of
theoretical computer science.

The lambda calculus was introduced by mathematician Alonzo Church as
an investigation into the foundations of mathematics.

The Church Turing thesis states that a function is algorithmically
computable if and only if it is computable by a Turing machine.

Mathematical logic has close connections to the foundations of
mathematics, theoretical computer science.

A Turing machine can be adapted to simulate the logic of any computer
algorithm.
"""

# One label per passage in plain_corpus, in order: 'Ecclesiastical 1'..'5'
# for the first five passages, then 'Logic 1'..'5' for the last five.
# `range` is used instead of the Python 2-only `xrange` so this cell runs
# unchanged on both Python 2 and Python 3.
DOCS_PER_TOPIC = 5
metadata = [
    '{0} {1}'.format(topic, i)
    for topic in ('Ecclesiastical', 'Logic')
    for i in range(1, DOCS_PER_TOPIC + 1)
]

# Build the corpus object from the raw text, attaching the metadata labels
# and enabling the nltk_stop option.
# NOTE(review): nltk_stop=True presumably filters NLTK English stop words --
# confirm in vsm.extensions.corpusbuilders.toy_corpus.
c = toy_corpus(plain_corpus, nltk_stop=True, metadata=metadata)

# Term-frequency model over the 'document' context type.
# NOTE(review): the argument 2 is presumably a worker-process count --
# confirm against TfMulti.train.
tf_m = TfMulti(c, 'document')
tf_m.train(2)

# TF-IDF model built from the trained term-frequency matrix.
tfidf_m = TfIdf(tf_m.matrix, 'document')
tfidf_m.train()

# LSA model built from the trained TF-IDF matrix, with k_factors=3.
lsa_m = Lsa(tfidf_m.matrix, 'document')
lsa_m.train(k_factors=3)

# One viewer per model; every later cell queries one of these three.
tf_v = TfViewer(c, tf_m)
tfidf_v = TfIdfViewer(c, tfidf_m)
lsa_v = LsaViewer(c, lsa_m)


Mapping
/usr/lib/pymodules/python2.7/numpy/ctypeslib.py:411: RuntimeWarning: Item size computed from the PEP 3118 buffer format string does not match the actual item size.
  return array(obj, copy=False)
Reducing

In [2]:
# Words ranked by distance from 'logic' under the term-frequency model,
# limited to 24 entries.
tf_v.dist_word_word('logic', print_len=24)


Out[2]:
Words: logic
Word Distance Word Distance
logic 0.00000 close 0.95532
computer 0.00000 adapted 0.95532
theoretical 0.61548 algorithm 0.95532
science 0.61548 american 0.95532
mathematical 0.61548 mathematician 1.15026
foundations 0.84107 alonzo 1.15026
simulate 0.95532 machine 1.15026
major 0.95532 mathematics 1.15026
made 0.95532 turing 1.30964
logician 0.95532 church 1.35081
contributions 0.95532 pope 1.57080
connections 0.95532 recognized 1.57080

In [3]:
# Same 'logic' query under the TF-IDF model. The recorded output is identical
# to the term-frequency result.
tfidf_v.dist_word_word('logic', print_len=24)


Out[3]:
Words: logic
Word Distance Word Distance
logic 0.00000 close 0.95532
computer 0.00000 adapted 0.95532
theoretical 0.61548 algorithm 0.95532
science 0.61548 american 0.95532
mathematical 0.61548 mathematician 1.15026
foundations 0.84107 alonzo 1.15026
simulate 0.95532 machine 1.15026
major 0.95532 mathematics 1.15026
made 0.95532 turing 1.30964
logician 0.95532 church 1.35081
contributions 0.95532 pope 1.57080
connections 0.95532 recognized 1.57080

In [4]:
# Same 'logic' query under the LSA model (k_factors=3), limited to 24 entries.
lsa_v.dist_word_word('logic', print_len=24)


Out[4]:
Words: logic
Word Distance Word Distance
logic 0.00000 contributions 0.16133
computer 0.00000 major 0.16133
connections 0.14615 alonzo 0.16665
close 0.14615 mathematician 0.16665
theoretical 0.15619 calculus 0.18072
science 0.15619 introduced 0.18072
mathematical 0.15619 lambda 0.18072
mathematics 0.16088 investigation 0.18072
foundations 0.16111 church 0.85232
made 0.16133 simulate 0.94910
american 0.16133 adapted 0.94910
logician 0.16133 algorithm 0.94910

In [5]:
# Documents ranked by distance from 'Ecclesiastical 4' under the
# term-frequency model.
tf_v.dist_doc_doc('Ecclesiastical 4')


Out[5]:
Documents:
Document Distance
Ecclesiastical 4 0.00000
Ecclesiastical 1 1.25961
Ecclesiastical 2 1.42595
Ecclesiastical 5 1.43676
Ecclesiastical 3 1.57080
Logic 1 1.57080
Logic 2 1.57080
Logic 3 1.57080
Logic 4 1.57080
Logic 5 1.57080

In [6]:
# Documents ranked by distance from 'Ecclesiastical 4' under the TF-IDF model.
tfidf_v.dist_doc_doc('Ecclesiastical 4')


Out[6]:
Documents:
Document Distance
Ecclesiastical 4 0.00000
Ecclesiastical 1 1.41228
Ecclesiastical 2 1.53742
Ecclesiastical 5 1.54051
Ecclesiastical 3 1.57080
Logic 1 1.57080
Logic 2 1.57080
Logic 3 1.57080
Logic 4 1.57080
Logic 5 1.57080

In [7]:
# Documents ranked by distance from the word 'computer' under the TF-IDF model.
tfidf_v.dist_word_doc('computer')


Out[7]:
Words:
Document Distance
Logic 5 1.32242
Logic 4 1.32915
Logic 1 1.38815
Ecclesiastical 1 1.57080
Ecclesiastical 2 1.57080
Ecclesiastical 3 1.57080
Ecclesiastical 4 1.57080
Ecclesiastical 5 1.57080
Logic 2 1.57080
Logic 3 1.57080

In [8]:
# Documents ranked by distance from the word 'church' under the LSA model.
# 'church' occurs in both the ecclesiastical and the logic passages.
lsa_v.dist_word_doc('church')


Out[8]:
Words:
Document Distance
Ecclesiastical 2 0.73633
Ecclesiastical 3 0.75591
Ecclesiastical 5 0.78818
Ecclesiastical 1 0.83776
Logic 5 0.84555
Ecclesiastical 4 0.84617
Logic 4 0.94199
Logic 2 0.94433
Logic 1 0.94556
Logic 3 1.11314

In [9]:
# Documents ranked by distance from 'Ecclesiastical 4' under the LSA model.
lsa_v.dist_doc_doc('Ecclesiastical 4')


Out[9]:
Documents:
Document Distance
Ecclesiastical 4 0.00000
Ecclesiastical 1 0.00843
Ecclesiastical 5 0.05804
Ecclesiastical 3 0.09036
Ecclesiastical 2 0.10991
Logic 2 1.56321
Logic 3 1.57501
Logic 1 1.57808
Logic 4 1.58446
Logic 5 1.58765

In [10]:
# Collection frequency of the single word 'church' (recorded result: 7).
tf_v.coll_freq('church')


Out[10]:
7

In [11]:
# Collection-frequency table for the corpus vocabulary, limited to 54 entries.
tf_v.coll_freqs(print_len=54)


Out[11]:
Collection Frequencies
Word Counts Word Counts Word Counts Word Counts
church 7 computable 2 calculus 1 states 1
catholic 4 augustine 2 american 1 stated 1
foundations 3 roman 2 close 1 source 1
turing 3 science 2 algorithmically 1 simulate 1
computer 3 alone 1 held 1 saint 1
pope 3 contributions 1 function 1 revealed 1
logic 3 algorithm 1 viii 1 regarded 1
alonzo 2 faith 1 introduced 1 recognized 1
mathematics 2 doctor 1 investigation 1 peter 1
theoretical 2 divinely 1 thesis 1 model 1
machine 2 bible 1 teaching 1 man 1
theology 2 boniface 1 teacher 1 major 1
mathematical 2 connections 1 successor 1 made 1
mathematician 2 challenged 1

In [12]:
# Pairwise word distance matrix under the term-frequency model, displayed
# rounded to two decimals. Rows/columns follow the order of the word list.
dm = tf_v.dismat_word(
    ['logic', 'church', 'catholic', 'pope']
)
np.around(dm, 2)


Out[12]:
array([[ 0.  ,  1.35,  1.57,  1.57],
       [ 1.35,  0.  ,  0.97,  0.86],
       [ 1.57,  0.97,  0.  ,  0.96],
       [ 1.57,  0.86,  0.96,  0.  ]])

In [13]:
# Pairwise word distance matrix under the TF-IDF model, displayed rounded to
# two decimals. Rows/columns follow the order of the word list.
dm = tfidf_v.dismat_word(
    ['logic', 'church', 'catholic', 'pope']
)
np.around(dm, 2)


Out[13]:
array([[ 0.  ,  1.35,  1.57,  1.57],
       [ 1.35,  0.  ,  0.97,  0.86],
       [ 1.57,  0.97,  0.  ,  0.96],
       [ 1.57,  0.86,  0.96,  0.  ]])

In [14]:
# Pairwise word distance matrix under the LSA model, displayed rounded to two
# decimals. Rows/columns follow the order of the word list.
dm = lsa_v.dismat_word(
    ['logic', 'church', 'catholic', 'pope']
)
np.around(dm, 2)


Out[14]:
array([[ 0.  ,  0.85,  1.57,  1.57],
       [ 0.85,  0.  ,  0.87,  0.87],
       [ 1.57,  0.87,  0.  ,  0.01],
       [ 1.57,  0.87,  0.01,  0.  ]])

In [15]:
# Pairwise document distance matrix under the term-frequency model, displayed
# rounded to two decimals. Rows/columns follow the order of the label list.
dm = tf_v.dismat_doc(
    ['Logic 1', 'Logic 2', 'Ecclesiastical 1', 'Ecclesiastical 2']
)
np.around(dm, 2)


Out[15]:
array([[ 0.  ,  1.21,  1.49,  1.46],
       [ 1.21,  0.  ,  1.47,  1.43],
       [ 1.49,  1.47,  0.  ,  1.33],
       [ 1.46,  1.43,  1.33,  0.  ]])

In [16]:
# Pairwise document distance matrix under the TF-IDF model, displayed rounded
# to two decimals. Rows/columns follow the order of the label list.
dm = tfidf_v.dismat_doc(
    ['Logic 1', 'Logic 2', 'Ecclesiastical 1', 'Ecclesiastical 2']
)
np.around(dm, 2)


Out[16]:
array([[ 0.  ,  1.39,  1.57,  1.57],
       [ 1.39,  0.  ,  1.57,  1.57],
       [ 1.57,  1.57,  0.  ,  1.54],
       [ 1.57,  1.57,  1.54,  0.  ]])

In [17]:
# Pairwise document distance matrix under the LSA model, displayed rounded to
# two decimals. Rows/columns follow the order of the label list.
dm = lsa_v.dismat_doc(
    ['Logic 1', 'Logic 2', 'Ecclesiastical 1', 'Ecclesiastical 2']
)
np.around(dm, 2)


Out[17]:
array([[ 0.  ,  0.02,  1.57,  1.49],
       [ 0.02,  0.  ,  1.56,  1.48],
       [ 1.57,  1.56,  0.  ,  0.1 ],
       [ 1.49,  1.48,  0.1 ,  0.  ]])

In [18]:
# Same 'logic' neighbourhood query on the LSA model, but with an explicit
# distance function built by scipy_cdist instead of the viewer's default.
# scipy_cdist is imported with the other imports in the notebook's first cell.
# NOTE(review): assuming scipy_cdist forwards `metric` to
# scipy.spatial.distance.cdist, 'minkowski' defaults to p=2 (Euclidean) --
# confirm against vsm.spatial.
lsa_v.dist_word_word(
    'logic',
    print_len=24,
    dist_fn=scipy_cdist(metric='minkowski'),
)


Out[18]:
Words: logic
Word Distance Word Distance
logic 0.00000 theoretical 0.06325
computer 0.00000 mathematics 0.07048
mathematician 0.04921 connections 0.09843
alonzo 0.04921 close 0.09843
foundations 0.05166 introduced 0.12865
american 0.05320 lambda 0.12865
contributions 0.05320 investigation 0.12865
logician 0.05320 calculus 0.12865
made 0.05320 church 0.18155
major 0.05320 simulate 0.19473
mathematical 0.06325 adapted 0.19473
science 0.06325 algorithm 0.19473

In [19]:
# Pairwise document distance matrix under the LSA model, using a distance
# function built by scipy_cdist with metric='cosine' rather than the viewer's
# default. Displayed rounded to two decimals.
dm = lsa_v.dismat_doc(
    ['Logic 1', 'Logic 2', 'Ecclesiastical 1', 'Ecclesiastical 2'],
    dist_fn=scipy_cdist(metric='cosine'),
)
np.around(dm, 2)


Out[19]:
array([[ 0.  ,  0.  ,  1.  ,  0.92],
       [ 0.  ,  0.  ,  0.99,  0.91],
       [ 1.  ,  0.99,  0.  ,  0.01],
       [ 0.92,  0.91,  0.01,  0.  ]])