In [40]:
import sys
# Reverse the interpreter's module search path so the last-appended entries
# are looked up first. NOTE(review): this is a blunt hack — presumably to
# prefer a locally installed gensim over a system copy; confirm it is still
# needed, since it silently changes resolution for EVERY subsequent import.
sys.path.reverse()
from gensim import corpora, models, similarities

In [41]:
# Peek at the raw dog-quote corpus (one quote per line).
# Use a context manager so the file handle is closed deterministically —
# the original `open(...).readlines()` leaked the handle.
with open('dogs.txt') as f:
    dog_lines = f.readlines()
dog_lines


Out[41]:
['I may not be perfect by my dog loves me\n',
 'If there are no dogs in heaven then when I die I want to go where they go\n',
 'Keep calm and walk the dog\n',
 'bark less, wag more\n',
 'Happiness is a warm puppy\n',
 'The average dog is a nicer person than the average person\n']

In [42]:
#learn words: build the token -> id vocabulary from the cat corpus.
# Open in binary mode so each line is a bytes object; .decode('ascii','ignore')
# then works under both Python 2 and Python 3 (Py3 str has no .decode), and
# the context manager closes the file (the original leaked the handle).
with open('cats.txt', 'rb') as f:
    dictionary = corpora.Dictionary(
        line.decode('ascii', 'ignore').lower().split() for line in f
    )

In [43]:
# Inspect the learned token -> integer-id mapping (58 entries at this point).
dictionary.token2id


Out[43]:
{'-': 23,
 'a': 0,
 'actress': 40,
 'an': 24,
 'are': 1,
 'as': 25,
 'be': 26,
 'better,': 27,
 'can': 47,
 'cat': 14,
 'cats': 2,
 'cause': 48,
 'equal': 28,
 'even': 29,
 'everything': 3,
 'exact': 49,
 'get': 30,
 'good': 41,
 'has': 4,
 'in': 5,
 'inconvenience': 50,
 'insane,': 42,
 'intended': 6,
 'is': 18,
 'it': 31,
 'itself': 32,
 'just': 43,
 'knows': 33,
 'mathematically': 51,
 'most': 52,
 'my': 44,
 'nature': 7,
 'never': 15,
 'not': 8,
 'on': 34,
 'or': 35,
 'out': 53,
 'outstubborn': 16,
 'place': 54,
 'purpose': 9,
 'really': 45,
 "she's": 46,
 'sit': 55,
 'spent': 19,
 'superior': 36,
 'teach': 10,
 'that': 11,
 'the': 37,
 'time': 20,
 'to': 12,
 'treat': 38,
 'try': 17,
 'us': 13,
 'wasted': 21,
 'way': 39,
 'will': 56,
 'with': 22,
 'work': 57}

In [44]:
# Vocabulary size after learning the cat corpus only.
len(dictionary)


Out[44]:
58

In [45]:
#learn some more words: extend the same dictionary with the dog corpus.
# Mirror the cat-corpus handling (binary read + ascii decode) so both corpora
# are tokenized identically — the original inconsistently skipped the decode
# here — and close the file via the context manager.
with open('dogs.txt', 'rb') as f:
    dictionary.add_documents(
        line.decode('ascii', 'ignore').lower().split() for line in f
    )

In [46]:
# Re-inspect the mapping: dog-corpus tokens now appended with ids 58..91.
dictionary.token2id


Out[46]:
{'-': 23,
 'a': 0,
 'actress': 40,
 'an': 24,
 'and': 77,
 'are': 1,
 'as': 25,
 'average': 88,
 'bark': 81,
 'be': 26,
 'better,': 27,
 'by': 58,
 'calm': 78,
 'can': 47,
 'cat': 14,
 'cats': 2,
 'cause': 48,
 'die': 65,
 'dog': 59,
 'dogs': 66,
 'equal': 28,
 'even': 29,
 'everything': 3,
 'exact': 49,
 'get': 30,
 'go': 67,
 'good': 41,
 'happiness': 85,
 'has': 4,
 'heaven': 68,
 'i': 60,
 'if': 69,
 'in': 5,
 'inconvenience': 50,
 'insane,': 42,
 'intended': 6,
 'is': 18,
 'it': 31,
 'itself': 32,
 'just': 43,
 'keep': 79,
 'knows': 33,
 'less,': 82,
 'loves': 61,
 'mathematically': 51,
 'may': 62,
 'me': 63,
 'more': 83,
 'most': 52,
 'my': 44,
 'nature': 7,
 'never': 15,
 'nicer': 89,
 'no': 70,
 'not': 8,
 'on': 34,
 'or': 35,
 'out': 53,
 'outstubborn': 16,
 'perfect': 64,
 'person': 90,
 'place': 54,
 'puppy': 86,
 'purpose': 9,
 'really': 45,
 "she's": 46,
 'sit': 55,
 'spent': 19,
 'superior': 36,
 'teach': 10,
 'than': 91,
 'that': 11,
 'the': 37,
 'then': 71,
 'there': 72,
 'they': 73,
 'time': 20,
 'to': 12,
 'treat': 38,
 'try': 17,
 'us': 13,
 'wag': 84,
 'walk': 80,
 'want': 74,
 'warm': 87,
 'wasted': 21,
 'way': 39,
 'when': 75,
 'where': 76,
 'will': 56,
 'with': 22,
 'work': 57}

In [47]:
# Vocabulary size after merging both corpora (58 cat + 34 new dog tokens = 92).
len(dictionary)


Out[47]:
92

In [48]:
#make a corpus of dog phrases only, as bag-of-words vectors.
# The [0:20000] slice merely caps work on huge files (dogs.txt has 6 lines).
# Binary read + decode keeps tokenization identical to the dictionary build
# and works on both Python 2 and 3; the with-block closes the file.
with open('dogs.txt', 'rb') as f:
    corpusDog = [
        dictionary.doc2bow(line.decode('ascii', 'ignore').lower().split())
        for line in f.readlines()[0:20000]
    ]

In [49]:
# Sanity check: 6 documents, each a list of (token_id, count) pairs.
print(len(corpusDog))
print(corpusDog)


6
[[(8, 1), (26, 1), (44, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1)], [(1, 1), (5, 1), (12, 1), (60, 2), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1)], [(37, 1), (59, 1), (77, 1), (78, 1), (79, 1), (80, 1)], [(81, 1), (82, 1), (83, 1), (84, 1)], [(0, 1), (18, 1), (85, 1), (86, 1), (87, 1)], [(0, 1), (18, 1), (37, 2), (59, 1), (88, 2), (89, 1), (90, 2), (91, 1)]]

In [50]:
#latent semantic index model http://en.wikipedia.org/wiki/Latent_semantic_indexing
# num_topics=1000 is just an upper bound; with only 6 documents the
# decomposition yields at most 6 topics (see the printed output below).
lsiDog = models.LsiModel(corpusDog, id2word=dictionary,num_topics=1000)
lsiDog.print_topics(1000)


Out[50]:
['0.452*"i" + 0.417*"go" + 0.208*"want" + 0.208*"no" + 0.208*"die" + 0.208*"there" + 0.208*"if" + 0.208*"heaven" + 0.208*"they" + 0.208*"then"',
 '0.501*"the" + 0.444*"person" + 0.444*"average" + 0.309*"dog" + 0.256*"a" + 0.256*"is" + 0.222*"than" + 0.222*"nicer" + 0.057*"keep" + 0.057*"and"',
 '-0.310*"not" + -0.310*"be" + -0.310*"may" + -0.310*"me" + -0.310*"my" + -0.310*"by" + -0.310*"loves" + -0.310*"perfect" + -0.299*"dog" + -0.210*"i"',
 '0.355*"walk" + 0.355*"keep" + 0.355*"and" + 0.355*"calm" + -0.274*"a" + -0.274*"is" + 0.262*"the" + 0.237*"dog" + -0.228*"puppy" + -0.228*"warm"',
 '-0.397*"warm" + -0.397*"puppy" + -0.397*"happiness" + -0.280*"is" + -0.280*"a" + -0.235*"walk" + -0.235*"calm" + -0.235*"and" + -0.235*"keep" + 0.234*"person"',
 '0.500*"wag" + 0.500*"more" + 0.500*"less," + 0.500*"bark" + -0.000*"warm" + -0.000*"happiness" + -0.000*"puppy" + -0.000*"is" + -0.000*"a" + 0.000*"person"']

In [51]:
# Build the cat corpus and its LSI model the same way as for dogs:
# binary read + ascii decode (Py2/Py3 safe), file closed by the with-block,
# and the [0:20000] slice as a cap on very large inputs.
with open('cats.txt', 'rb') as f:
    corpusCat = [
        dictionary.doc2bow(line.decode('ascii', 'ignore').lower().split())
        for line in f.readlines()[0:20000]
    ]
lsiCat = models.LsiModel(corpusCat, id2word=dictionary, num_topics=1000)
lsiCat.print_topics(1000)


Out[51]:
['0.510*"to" + 0.316*"the" + 0.286*"as" + 0.286*"it" + 0.234*"a" + 0.207*"cat" + 0.183*"is" + 0.164*"with" + 0.143*"way" + 0.143*"better,"',
 '0.334*"that" + 0.334*"cats" + 0.201*"not" + 0.167*"can" + 0.167*"work" + 0.167*"will" + 0.167*"sit" + 0.167*"place" + 0.167*"mathematically" + 0.167*"out"',
 '0.384*"a" + 0.301*"cat" + 0.250*"not" + 0.238*"is" + -0.211*"the" + 0.168*"never" + 0.167*"she\'s" + 0.167*"good" + 0.167*"insane," + 0.167*"just"',
 '0.252*"cat" + 0.212*"is" + -0.204*"are" + -0.204*"has" + -0.204*"purpose" + -0.204*"nature" + -0.204*"us" + -0.204*"intended" + -0.204*"teach" + -0.204*"everything"',
 '-0.425*"never" + -0.255*"wasted" + -0.255*"spent" + -0.255*"time" + 0.232*"just" + 0.232*"she\'s" + 0.232*"actress" + 0.232*"good" + 0.232*"insane," + 0.232*"my"',
 '0.439*"try" + 0.439*"outstubborn" + -0.305*"with" + 0.304*"to" + -0.287*"is" + -0.281*"spent" + -0.281*"wasted" + -0.281*"time" + 0.158*"never" + 0.152*"cat"']

In [52]:
# The query phrase we will project into each model's topic space.
# A plain string literal suffices: the phrase has no escapes or newlines,
# so the raw triple-quoted form produced exactly the same value.
aPhrase = 'let me be the person my dog thinks I am'

In [53]:
# Bag-of-words vector of the query; words absent from the dictionary are dropped.
vecPhrase = dictionary.doc2bow(aPhrase.lower().split())
veclsiDog= lsiDog[vecPhrase] # convert the query to LSI space

In [54]:
from operator import itemgetter
# Rank the dog-model topics by weight (element 1 of each (topic, weight) pair),
# strongest first.
sorted(veclsiDog, key=itemgetter(1),reverse=True)


Out[54]:
[(1, 1.343991063839294),
 (0, 0.64408113020523772),
 (4, 0.22114439319510543),
 (3, 0.14019545357926735),
 (2, -1.268918087721125)]

In [55]:
veclsiCat= lsiCat[vecPhrase] # convert the query to LSI space
# Rank the cat-model topics by weight for comparison with the dog model above.
sorted(veclsiCat, key=itemgetter(1),reverse=True)


Out[55]:
[(0, 0.47756128610624693),
 (4, 0.34151377038310238),
 (3, 0.19470745296967434),
 (1, -0.023292724700136595),
 (5, -0.070387683050306432),
 (2, -0.078009803158084284)]

In [76]:
#inverse frequency (tf-idf) model, increases value of rare words (bad choice)
model=models.tfidfmodel.TfidfModel(corpusCat,id2word=dictionary,normalize=True)
vectfid=model[vecPhrase]
print(sorted(vectfid, key=itemgetter(1),reverse=True))
# Show which token id 26 maps back to ('be').
print(model.id2word[26])


[(26, 0.6487560173001553), (44, 0.6487560173001553), (37, 0.39778293079728944)]
be

In [65]:
# Same tf-idf weighting, this time fit on the dog corpus; note the query's
# dog-specific tokens now receive nonzero weight.
model=models.tfidfmodel.TfidfModel(corpusDog,id2word=dictionary,normalize=True)
vectfid=model[vecPhrase]
sorted(vectfid, key=itemgetter(1),reverse=True)


Out[65]:
[(26, 0.4516823307084841),
 (44, 0.4516823307084841),
 (63, 0.4516823307084841),
 (90, 0.4516823307084841),
 (37, 0.2769477530956665),
 (60, 0.2769477530956665),
 (59, 0.1747345776128177)]

In [82]:
#random projections: project the query through a random-matrix model fit on
# each corpus and show the lowest-ranked (most negative) component.
# Distinct names per model avoid rebinding `model` twice in one cell.
rpCat = models.RpModel(corpusCat, id2word=dictionary)
print(sorted(rpCat[vecPhrase], key=itemgetter(1), reverse=True)[-1])
print(rpCat)

rpDog = models.RpModel(corpusDog, id2word=dictionary)
print(sorted(rpDog[vecPhrase], key=itemgetter(1), reverse=True)[-1])
print(rpDog)


(290, -0.40414515137672424)
RpModel(num_terms=92, num_topics=300)
(139, -0.40414515137672424)
RpModel(num_terms=92, num_topics=300)

In [58]: