In [1]:
from htmresearch.encoders.cio_encoder import CioEncoder

In [2]:
%matplotlib inline

In [3]:
import matplotlib.pyplot as plt

In [4]:
import matplotlib.image as mpimg

In [5]:
import numpy

In [6]:
# Shared encoder instance: plotText() and compareText() below read this
# module-level name as a global rather than taking it as a parameter.
encoder = CioEncoder(retina="en_synonymous")

In [7]:
def plotText(text, shape=(128, 128)):
    """Encode ``text`` and draw its fingerprint as a 2-D bitmap.

    The module-level ``encoder`` is used to encode the text. The indices in
    ``['fingerprint']['positions']`` of the result are treated as positions
    in a flattened ``rows * cols`` array, set to 1, and the array is then
    reshaped to ``shape`` and shown with ``plt.imshow``.

    Args:
        text: The text to encode.
        shape: ``(rows, cols)`` of the rendered bitmap. Defaults to
            ``(128, 128)``, the size this notebook originally hard-coded.

    Side effects:
        Draws on the current matplotlib axes and prints the encoder-reported
        sparsity together with the number of fingerprint positions.
    """
    encoding = encoder.encode(text)
    positions = list(encoding['fingerprint']['positions'])

    rows, cols = shape
    bitmap = numpy.zeros(rows * cols)
    bitmap[positions] = 1
    bitmap = bitmap.reshape((rows, cols))

    # The "spectral" colormap name was removed in matplotlib 2.2;
    # "nipy_spectral" is the same colormap under its current name.
    plt.imshow(bitmap, cmap="nipy_spectral")

    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("Sparsity= %s bits= %s" % (encoding['sparsity'], len(positions)))

In [8]:
def compareText(text1, text2):
    """Encode two texts and report how many fingerprint positions they share.

    Both texts are encoded with the module-level ``encoder``. The length of
    each ``['fingerprint']['positions']`` sequence is printed, followed by
    the size of the intersection of the two position sets.

    Args:
        text1: First text to encode.
        text2: Second text to encode.

    Returns:
        The number of positions present in both fingerprints.
    """
    positions1 = encoder.encode(text1)['fingerprint']['positions']
    positions2 = encoder.encode(text2)['fingerprint']['positions']

    # Each print receives one pre-formatted string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    # The two spaces after "=" reproduce the original comma-separated output.
    print("Bits for '%s' =  %d" % (text1, len(positions1)))
    print("Bits for '%s' =  %d" % (text2, len(positions2)))

    overlap = len(set(positions1) & set(positions2))
    print("Overlap= %d" % overlap)
    return overlap

In [9]:
# Visualize the fingerprint of the base sentence that the comparison cells
# below reuse as their first argument.
plotText("James quit smoking but it was not an easy decision.")


Sparsity= 0.0362548828125 bits= 594

In [10]:
# Base sentence vs. a near-paraphrase: "quit" -> "stopped", the article "an"
# and the final period are dropped.
# NOTE(review): the recorded run reports all 594 bits overlapping, i.e. the
# two fingerprints came out identical -- confirm this is expected.
compareText("James quit smoking but it was not an easy decision.", 
            "James stopped smoking but it was not easy decision")


Bits for 'James quit smoking but it was not an easy decision.' =  594
Bits for 'James stopped smoking but it was not easy decision' =  594
Overlap= 594
Out[10]:
594

In [11]:
# Base sentence vs. a variant with "quit" -> "stopped" and
# "an easy" -> "a simple" (final period dropped).
compareText("James quit smoking but it was not an easy decision.", 
            "James stopped smoking but it was not a simple decision")


Bits for 'James quit smoking but it was not an easy decision.' =  594
Bits for 'James stopped smoking but it was not a simple decision' =  602
Overlap= 501
Out[11]:
501

In [12]:
# Same variant as the previous cell, with "decision" -> "choice" as well.
compareText("James quit smoking but it was not an easy decision.", 
            "James stopped smoking but it was not a simple choice")


Bits for 'James quit smoking but it was not an easy decision.' =  594
Bits for 'James stopped smoking but it was not a simple choice' =  570
Overlap= 392
Out[12]:
392

In [13]:
# Same variant as the previous cell, with the subject changed: "James" -> "Sue".
compareText("James quit smoking but it was not an easy decision.", 
            "Sue stopped smoking but it was not a simple choice")


Bits for 'James quit smoking but it was not an easy decision.' =  594
Bits for 'Sue stopped smoking but it was not a simple choice' =  559
Overlap= 294
Out[13]:
294

In [14]:
# Same variant as the previous cell, with "smoking" -> "cigarettes" as well.
compareText("James quit smoking but it was not an easy decision.", 
            "Sue stopped cigarettes but it was not a simple choice")


Bits for 'James quit smoking but it was not an easy decision.' =  594
Bits for 'Sue stopped cigarettes but it was not a simple choice' =  693
Overlap= 237
Out[14]:
237

In [15]:
# Base sentence vs. a sentence on a different subject (presumably a
# low-overlap reference point for the comparisons above).
compareText("James quit smoking but it was not an easy decision.", 
            "We start with ten base sentences each with ten words")


Bits for 'James quit smoking but it was not an easy decision.' =  594
Bits for 'We start with ten base sentences each with ten words' =  298
Overlap= 65
Out[15]:
65

In [16]:
# Base sentence vs. a sentence that also mentions smoking but says something
# different (keeps smoking rather than quitting).
compareText("James quit smoking but it was not an easy decision.", 
            "Sue keeps smoking a lot of cigarettes")


Bits for 'James quit smoking but it was not an easy decision.' =  594
Bits for 'Sue keeps smoking a lot of cigarettes' =  549
Overlap= 263
Out[16]:
263

In [17]:
# Visualize the fingerprint of a string of loosely related words that does
# not form a coherent sentence.
plotText("the ski lift of projectors requires random resorts vacation chairs kids")


Sparsity= 0.063720703125 bits= 1044

In [18]:
# Visualize the fingerprint of a single word.
plotText("biology")


Sparsity= 0.010009765625 bits= 164

In [19]:
# Compare a two-word phrase with a single word.
compareText("mountains vacation","skiing")


Bits for 'mountains vacation' =  318
Bits for 'skiing' =  164
Overlap= 24
Out[19]:
24

In [20]:
# Paragraph-length inputs: James quitting cigarettes vs. a paragraph about
# the effects of second-hand smoke.
compareText("James loved to puff on his cigarettes. However he recently read a bunch of articles describing their unhealthy effects. James decided to quit cigarettes completely. It clearly was not an easy decision, but he felt it was the right one.", 
           "The tobacco industry has long tried to hide the effects of second hand smoke. However there is now a preponderance of evidence demonstrating its ill effects. Second hand smoke is clearly dangerous to anyone who breathes it in.")


Bits for 'James loved to puff on his cigarettes. However he recently read a bunch of articles describing their unhealthy effects. James decided to quit cigarettes completely. It clearly was not an easy decision, but he felt it was the right one.' =  1197
Bits for 'The tobacco industry has long tried to hide the effects of second hand smoke. However there is now a preponderance of evidence demonstrating its ill effects. Second hand smoke is clearly dangerous to anyone who breathes it in.' =  1259
Overlap= 495
Out[20]:
495

In [21]:
# Same first paragraph vs. a paragraph about cigars and lung cancer.
compareText("James loved to puff on his cigarettes. However he recently read a bunch of articles describing their unhealthy effects. James decided to quit cigarettes completely. It clearly was not an easy decision, but he felt it was the right one.", 
           "the dangers of cigars. a cigar is basically a cancer stick. Cigars are known to contain over 7000 chemicals, at least 250 of which are directly tied to lung cancer.")


Bits for 'James loved to puff on his cigarettes. However he recently read a bunch of articles describing their unhealthy effects. James decided to quit cigarettes completely. It clearly was not an easy decision, but he felt it was the right one.' =  1197
Bits for 'the dangers of cigars. a cigar is basically a cancer stick. Cigars are known to contain over 7000 chemicals, at least 250 of which are directly tied to lung cancer.' =  985
Overlap= 337
Out[21]:
337

In [22]:
# Same first paragraph vs. a paragraph about jogging and health, which does
# not mention smoking.
compareText("James loved to puff on his cigarettes. However he recently read a bunch of articles describing their unhealthy effects. James decided to quit cigarettes completely. It clearly was not an easy decision, but he felt it was the right one.", 
           "people with health problems are turning to jogging as a solution. It raises heart rate and causes a rush of endorphins to the body. It also makes people more resilient to disease. It's the ultimate preventative medicine for health issues.")


Bits for 'James loved to puff on his cigarettes. However he recently read a bunch of articles describing their unhealthy effects. James decided to quit cigarettes completely. It clearly was not an easy decision, but he felt it was the right one.' =  1197
Bits for 'people with health problems are turning to jogging as a solution. It raises heart rate and causes a rush of endorphins to the body. It also makes people more resilient to disease. It's the ultimate preventative medicine for health issues.' =  1362
Overlap= 392
Out[22]:
392

In [23]:
# Compare two single words taken from the paragraphs above.
compareText("James","tobacco")


Bits for 'James' =  164
Bits for 'tobacco' =  164
Overlap= 10
Out[23]:
10

In [24]:
# Visualize the fingerprint returned for a string that is not an English word.
plotText("advsh")


Sparsity= 0.00994873046875 bits= 163

In [ ]: