In [38]:
import sys
sys.path.reverse()
from gensim import corpora, models, similarities

In [51]:
open('dogs.txt').readlines()


Out[51]:
['I may not be perfect by my dog loves me\n',
 'If there are no dogs in heaven then when I die I want to go where they go\n',
 'Keep calm and walk the dog\n',
 'bark less, wag more\n',
 'Happiness is a warm puppy\n',
 'The average dog is a nicer person than the average person\n']

In [39]:
#learn words
dictionary = corpora.Dictionary(line.decode('ascii','ignore').lower().split() for line in open('cats.txt').readlines())

In [40]:
dictionary.token2id


Out[40]:
{'-': 23,
 'a': 0,
 'actress': 40,
 'an': 24,
 'are': 1,
 'as': 25,
 'be': 26,
 'better,': 27,
 'can': 47,
 'cat': 14,
 'cats': 2,
 'cause': 48,
 'equal': 28,
 'even': 29,
 'everything': 3,
 'exact': 49,
 'get': 30,
 'good': 41,
 'has': 4,
 'in': 5,
 'inconvenience': 50,
 'insane,': 42,
 'intended': 6,
 'is': 18,
 'it': 31,
 'itself': 32,
 'just': 43,
 'knows': 33,
 'mathematically': 51,
 'most': 52,
 'my': 44,
 'nature': 7,
 'never': 15,
 'not': 8,
 'on': 34,
 'or': 35,
 'out': 53,
 'outstubborn': 16,
 'place': 54,
 'purpose': 9,
 'really': 45,
 "she's": 46,
 'sit': 55,
 'spent': 19,
 'superior': 36,
 'teach': 10,
 'that': 11,
 'the': 37,
 'time': 20,
 'to': 12,
 'treat': 38,
 'try': 17,
 'us': 13,
 'wasted': 21,
 'way': 39,
 'will': 56,
 'with': 22,
 'work': 57}

In [42]:
len(dictionary)


Out[42]:
58

In [43]:
#learn some more words 
dictionary.add_documents(line.lower().split() for line in open('dogs.txt').readlines())

In [44]:
dictionary.token2id


Out[44]:
{'-': 23,
 'a': 0,
 'actress': 40,
 'an': 24,
 'and': 77,
 'are': 1,
 'as': 25,
 'average': 88,
 'bark': 81,
 'be': 26,
 'better,': 27,
 'by': 58,
 'calm': 78,
 'can': 47,
 'cat': 14,
 'cats': 2,
 'cause': 48,
 'die': 65,
 'dog': 59,
 'dogs': 66,
 'equal': 28,
 'even': 29,
 'everything': 3,
 'exact': 49,
 'get': 30,
 'go': 67,
 'good': 41,
 'happiness': 85,
 'has': 4,
 'heaven': 68,
 'i': 60,
 'if': 69,
 'in': 5,
 'inconvenience': 50,
 'insane,': 42,
 'intended': 6,
 'is': 18,
 'it': 31,
 'itself': 32,
 'just': 43,
 'keep': 79,
 'knows': 33,
 'less,': 82,
 'loves': 61,
 'mathematically': 51,
 'may': 62,
 'me': 63,
 'more': 83,
 'most': 52,
 'my': 44,
 'nature': 7,
 'never': 15,
 'nicer': 89,
 'no': 70,
 'not': 8,
 'on': 34,
 'or': 35,
 'out': 53,
 'outstubborn': 16,
 'perfect': 64,
 'person': 90,
 'place': 54,
 'puppy': 86,
 'purpose': 9,
 'really': 45,
 "she's": 46,
 'sit': 55,
 'spent': 19,
 'superior': 36,
 'teach': 10,
 'than': 91,
 'that': 11,
 'the': 37,
 'then': 71,
 'there': 72,
 'they': 73,
 'time': 20,
 'to': 12,
 'treat': 38,
 'try': 17,
 'us': 13,
 'wag': 84,
 'walk': 80,
 'want': 74,
 'warm': 87,
 'wasted': 21,
 'way': 39,
 'when': 75,
 'where': 76,
 'will': 56,
 'with': 22,
 'work': 57}

In [45]:
len(dictionary)


Out[45]:
92

In [46]:
#make a corpus of dog phrases only
corpusDog=[dictionary.doc2bow(line.decode('ascii','ignore').lower().split()) for line in open('dogs.txt').readlines()[0:20000]]

In [49]:
print(len(corpusDog))
print(corpusDog)


6
[[(8, 1), (26, 1), (44, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1)], [(1, 1), (5, 1), (12, 1), (60, 2), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1)], [(37, 1), (59, 1), (77, 1), (78, 1), (79, 1), (80, 1)], [(81, 1), (82, 1), (83, 1), (84, 1)], [(0, 1), (18, 1), (85, 1), (86, 1), (87, 1)], [(0, 1), (18, 1), (37, 2), (59, 1), (88, 2), (89, 1), (90, 2), (91, 1)]]

In [60]:
#latent semantic index model http://en.wikipedia.org/wiki/Latent_semantic_indexing
lsiDog = models.LsiModel(corpusDog, id2word=dictionary,num_topics=1000)
lsiDog.print_topics(1000)


Out[60]:
['0.452*"i" + 0.417*"go" + 0.208*"to" + 0.208*"heaven" + 0.208*"when" + 0.208*"there" + 0.208*"they" + 0.208*"then" + 0.208*"no" + 0.208*"where"',
 '0.501*"the" + 0.444*"average" + 0.444*"person" + 0.309*"dog" + 0.256*"a" + 0.256*"is" + 0.222*"than" + 0.222*"nicer" + 0.057*"and" + 0.057*"keep"',
 '-0.310*"be" + -0.310*"not" + -0.310*"my" + -0.310*"by" + -0.310*"loves" + -0.310*"may" + -0.310*"me" + -0.310*"perfect" + -0.299*"dog" + -0.210*"i"',
 '-0.355*"walk" + -0.355*"and" + -0.355*"calm" + -0.355*"keep" + 0.274*"is" + 0.274*"a" + -0.262*"the" + -0.237*"dog" + 0.228*"happiness" + 0.228*"puppy"',
 '-0.397*"warm" + -0.397*"puppy" + -0.397*"happiness" + -0.280*"is" + -0.280*"a" + -0.235*"walk" + -0.235*"and" + -0.235*"calm" + -0.235*"keep" + 0.234*"average"',
 '-0.500*"wag" + -0.500*"more" + -0.500*"less," + -0.500*"bark" + 0.000*"happiness" + 0.000*"warm" + 0.000*"puppy" + 0.000*"a" + 0.000*"is" + -0.000*"person"']

In [52]:
corpusCat=[dictionary.doc2bow(line.decode('ascii','ignore').lower().split()) for line in open('cats.txt').readlines()[0:20000]]
lsiCat = models.LsiModel(corpusCat, id2word=dictionary,num_topics=1000)
lsiCat.print_topics(1000)


Out[52]:
['0.510*"to" + 0.316*"the" + 0.286*"it" + 0.286*"as" + 0.234*"a" + 0.207*"cat" + 0.183*"is" + 0.164*"with" + 0.143*"superior" + 0.143*"knows"',
 '0.334*"that" + 0.334*"cats" + 0.201*"not" + 0.167*"will" + 0.167*"cause" + 0.167*"exact" + 0.167*"place" + 0.167*"out" + 0.167*"most" + 0.167*"mathematically"',
 '0.384*"a" + 0.301*"cat" + 0.250*"not" + 0.238*"is" + -0.211*"the" + 0.168*"never" + 0.167*"really" + 0.167*"actress" + 0.167*"good" + 0.167*"just"',
 '0.252*"cat" + 0.212*"is" + -0.204*"everything" + -0.204*"in" + -0.204*"are" + -0.204*"teach" + -0.204*"intended" + -0.204*"us" + -0.204*"purpose" + -0.204*"has"',
 '-0.425*"never" + -0.255*"spent" + -0.255*"wasted" + -0.255*"time" + 0.232*"just" + 0.232*"good" + 0.232*"really" + 0.232*"she\'s" + 0.232*"actress" + 0.232*"insane,"',
 '0.439*"try" + 0.439*"outstubborn" + -0.305*"with" + 0.304*"to" + -0.287*"is" + -0.281*"time" + -0.281*"spent" + -0.281*"wasted" + 0.158*"never" + 0.152*"cat"']

In [56]:
aPhrase=r'''let me be the person my dog thinks I am'''

In [57]:
vecPhrase = dictionary.doc2bow(aPhrase.lower().split())
veclsiDog= lsiDog[vecPhrase] # convert the query to LSI space

In [58]:
from operator import itemgetter
sorted(veclsiDog, key=itemgetter(1),reverse=True)


Out[58]:
[(1, 1.3439910638392967),
 (0, 0.64408113020523494),
 (4, 0.22114439319510493),
 (3, -0.14019545357926885),
 (2, -1.2689180877211235)]

In [59]:
veclsiCat= lsiCat[vecPhrase] # convert the query to LSI space
sorted(veclsiCat, key=itemgetter(1),reverse=True)


Out[59]:
[(0, 0.47756128610624737),
 (4, 0.34151377038310221),
 (3, 0.19470745296967404),
 (1, -0.023292724700136415),
 (5, -0.070387683050306404),
 (2, -0.078009803158084479)]

In [ ]: