Notebook for playing around, testing, and speed comparisons


In [1]:
from __future__ import division

import numpy as np
from gensim.models import Word2Vec 
import scipy.stats as stat
from scipy.special import gamma
import scipy
from numpy import pi
from numba import jit, vectorize, float32, float64
import time
import line_profiler
import nltk
import os
import cProfile
import IPython
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)
%load_ext line_profiler
%load_ext autoreload

In [2]:
# Load pre-trained word vectors from a plain-text file in word2vec format
# (binary=False). The file name suggests 50-dimensional GloVe 6B vectors.
# NOTE(review): the path is absolute and machine-specific; the same path is
# assigned again to `wordvec_fileapth` in a later cell.
wordvecs = Word2Vec.load_word2vec_format(
    "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt", binary=False)


WARNING:gensim.models.word2vec:consider setting layer size to a multiple of 4 for greater performance

In [ ]:
len(wordvecs.vocab.keys())

In [ ]:
cd GaussianLDA/

In [86]:
import FastGaussianLDA
%aimport FastGaussianLDA
%autoreload 1
# import GaussianLDA
# reload(Gauss_LDA)
# reload(GaussianLDA)

# corpus = ['apple banana mango melon',
#  'barrel cannon gun war',
#  'democrat republican senate politics']
# Toy corpus: six short documents that alternate between fruit words and art words.
corpus2 = ["apple orange mango melon ", "canvas art mural paint painting ", "pineapple kiwi grape strawberry ",
              "picture frame picasso sculpture art ", "coconut guava blueberry blackberry ", "statue monument art artist "]
# corpus2 = [sent * 5 for sent in corpus2]*4
# NOTE(review): this path is assigned but not used in this cell (the model is given the
# already-loaded `wordvecs`); the name also looks like a typo of "filepath".
wordvec_fileapth = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"
# Wall-clock timing of model construction plus fitting.
start  = time.time()
# First positional argument is presumably the number of topics -- TODO confirm against FastGaussianLDA.
g = FastGaussianLDA.Gauss_LDA(2, corpus2, word_vector_model=wordvecs)
# The captured log below reports iterations 0 through 19 for this call.
g.fit(20)
# Elapsed seconds for construction + fit.
print (time.time()-start)


Done processing corpus with 6 documents
There are 24 words that could be convereted to word vectors in your corpus 
There are 0 words that could NOT be converted to word vectors
Intialization complete
Starting fit
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
0 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
1 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
2 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
3 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
4 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
5 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
6 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
7 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
8 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
9 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
10 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
11 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
12 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
13 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
14 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
15 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
16 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
17 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
18 iterations complete
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
19 iterations complete
0.794059991837

The old run time was 237 seconds for 30 iterations without Numba.


In [58]:
np.loadtxt("/Users/michael/Documents/GaussianLDA/output/iter10topic0Topic Mean.txt")


Out[58]:
array([ 0.04701606,  0.06495298, -0.06701785,  0.12368204,  0.04580263,
        0.04126731, -0.15244943, -0.11145958, -0.00325656,  0.08293305,
       -0.03406654,  0.03001568,  0.16134068, -0.03776183,  0.04927907,
       -0.01430325, -0.08997902,  0.08577938,  0.039224  , -0.12702498,
        0.0261495 , -0.06010891,  0.04436867, -0.01811015,  0.00611593,
       -0.02520956, -0.09775412,  0.10198555,  0.05974979, -0.10871787,
        0.24333452, -0.09253282, -0.10494265,  0.10045677, -0.00074449,
        0.01816347, -0.1071244 ,  0.02481544,  0.04297808, -0.10480475,
        0.03284159,  0.00831267, -0.10501135, -0.10401834,  0.0992685 ,
        0.07110563, -0.01013705, -0.16764329,  0.00626053, -0.08539053])

In [50]:
np.savetxt("/Users/michael/Documents/GaussianLDA/output/test.txt", g.topic_params[0]["Topic Mean"])

In [ ]:


In [ ]:
mean1 = g.topic_params[0]["Topic Mean"]; mean2 = g.topic_params[1]["Topic Mean"]; mean3 = g.topic_params[2]["Topic Mean"]
wordvecs.most_similar(positive=[mean1])

In [ ]:
wordvecs.most_similar(positive=[mean2])

In [ ]:
wordvecs.most_similar(positive=[mean3])

In [ ]:
newdoc = "fruit apple orange mango taco painting sculpture book art artist picasso"

In [ ]:
def myfunc(mat):
    """Small linear-algebra workload used as a profiling target.

    Evaluates the determinant of ``mat`` (the value is not used) and then
    returns the inverse of ``mat``.
    """
    linalg = np.linalg
    linalg.det(mat)  # computed for its cost only; the value is dropped
    inverse = linalg.inv(mat)
    return inverse
    
%prun myfunc(np.arange(100).reshape(10,10))

Experimenting with Numba accelerations:

  • Using numpy ufuncs

In [ ]:
import numpy as np

In [3]:
# Fixed inputs for the `chol_downdate(a, x)` timing cells further down.
# `a` plays the role of the lower-triangular factor: 4 * identity, 50 x 50.
# The size is an int because np.eye does not accept a float dimension.
a = np.eye(50) * 4
# scipy.linalg.cholesky(a, lower=True, overwrite_a=True)
# Float dtype so that float values written into the vector are not truncated to ints.
# NOTE(review): with these values a.dot(a.T) - outer(x, x) is not positive definite
# (sum(x**2) = 40425 > 16), so the downdate takes its NaN/unstable branch; use a
# much smaller x to time the normal path.
x = np.arange(50, dtype=np.float64)

In [4]:
from numba import generated_jit, vectorize

In [5]:
b = np.arange(1000*1000).reshape(1000,1000)

In [56]:
@jit(cache=True)
def func(val):
    """Column-wise sum of ``val`` (``np.sum`` over axis 0), compiled with Numba.

    :param val: array to reduce along its first axis
    :return: the sums along axis 0
    """
    # Return the reduction; without the `return` the sum was computed and
    # dropped, and every call yielded None.
    return np.sum(val, axis=0)

In [57]:
%%timeit
func(b)


The slowest run took 63.91 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 991 µs per loop

In [58]:
%%timeit
# NOTE(review): this `def` is the whole body of the %%timeit cell, so only the
# function *definition* is timed; `func` is never called here. The reported
# time is therefore not comparable with the jitted `func(b)` timing above.
def func(val):
    """Un-jitted variant: sums ``val`` over axis 0 and discards the result (returns None)."""
    np.sum(val, axis=0)


10000000 loops, best of 3: 90.2 ns per loop
  • Using python operators

In [ ]:
# Lazy compilation: the former `(float32[:,:])` argument described a single
# float32 2-D array, which is not a signature for this two-argument function
# and did not match the float64 inputs used in the timing cell.
@jit
def chol_downdate(L, X):
    """
    Cholesky rank-1 downdate, written with plain Python operators.

    Given the lower-triangular factor L of A = L L^T, computes the
    lower-triangular factor of A - X X^T.

    :param L: Lower triangle matrix from Cholesky Decomposition (not modified)
    :param X: Word Vector with same column dimensionality as L (not modified)
    :return: updated lower triangle matrix; if the downdate becomes
             numerically unstable (NaN or inf on the diagonal), a message is
             printed and the original L is returned instead
    """
    L_c = np.copy(L)  # work on a copy so L can be handed back untouched on failure
    # astype returns a new array, so the caller's vector is never overwritten
    # (repeated calls, e.g. under %%timeit, all see the same X); float64 also
    # stops integer inputs from truncating the updates below.
    x = X.astype(np.float64)
    for k in range(x.shape[0]):
        r = np.sqrt(L_c[k, k]**2 - x[k]**2)
        # A NaN/inf here would contaminate every later step, so stop at once.
        # Checking inside the loop also keeps `r` defined for an empty X.
        if np.isnan(r) or np.isinf(r):
            print("YOU GOT Nans or infs: learn to code better shmuck")
            return L
        c = r / L_c[k, k]
        s = x[k] / L_c[k, k]
        L_c[k, k] = r

        for i in range(k+1, x.shape[0]):
            L_c[i, k] = (L_c[i, k] - (s * x[i])) / c
            x[i] = (c * x[i]) - (s * L_c[i, k])
    return L_c

In [ ]:
%%timeit
chol_downdate(a, x)
  • Using numpy ufuncs is SLOWER

In [ ]:
@jit
def chol_downdate(L, X):
    """
    Cholesky rank-1 downdate, written with numpy ufuncs (``np.power``).

    Given the lower-triangular factor L of A = L L^T, computes the
    lower-triangular factor of A - X X^T.

    :param L: Lower triangle matrix from Cholesky Decomposition (not modified)
    :param X: Word Vector with same column dimensionality as L (not modified)
    :return: updated lower triangle matrix; if the downdate becomes
             numerically unstable (NaN or inf on the diagonal), a message is
             printed and the original L is returned instead
    :raises AssertionError: if X's length differs from L's column count
    """
    assert L.shape[1] == X.shape[0]
    L_c = np.copy(L)  # work on a copy so L can be handed back untouched on failure
    # astype returns a new array, so the caller's vector is never overwritten
    # (repeated calls, e.g. under %%timeit, all see the same X); float64 also
    # stops integer inputs from truncating the updates below.
    x = X.astype(np.float64)
    for k in range(x.shape[0]):
        r = np.sqrt(np.power(L_c[k, k], 2) - np.power(x[k], 2))
        # A NaN/inf here would contaminate every later step, so stop at once.
        # Checking inside the loop also keeps `r` defined for an empty X.
        if np.isnan(r) or np.isinf(r):
            print("YOU GOT Nans or infs: learn to code better shmuck")
            return L
        c = r / L_c[k, k]
        s = x[k] / L_c[k, k]
        L_c[k, k] = r

        for i in range(k+1, x.shape[0]):
            L_c[i, k] = (L_c[i, k] - (s * x[i])) / c
            x[i] = (c * x[i]) - (s * L_c[i, k])
    return L_c

In [ ]:
%%timeit
chol_downdate(a, x)

In [9]:
np.sum(b, axis=0)


Out[9]:
array([499500000, 499501000, 499502000, 499503000, 499504000, 499505000,
       499506000, 499507000, 499508000, 499509000, 499510000, 499511000,
       499512000, 499513000, 499514000, 499515000, 499516000, 499517000,
       499518000, 499519000, 499520000, 499521000, 499522000, 499523000,
       499524000, 499525000, 499526000, 499527000, 499528000, 499529000,
       499530000, 499531000, 499532000, 499533000, 499534000, 499535000,
       499536000, 499537000, 499538000, 499539000, 499540000, 499541000,
       499542000, 499543000, 499544000, 499545000, 499546000, 499547000,
       499548000, 499549000, 499550000, 499551000, 499552000, 499553000,
       499554000, 499555000, 499556000, 499557000, 499558000, 499559000,
       499560000, 499561000, 499562000, 499563000, 499564000, 499565000,
       499566000, 499567000, 499568000, 499569000, 499570000, 499571000,
       499572000, 499573000, 499574000, 499575000, 499576000, 499577000,
       499578000, 499579000, 499580000, 499581000, 499582000, 499583000,
       499584000, 499585000, 499586000, 499587000, 499588000, 499589000,
       499590000, 499591000, 499592000, 499593000, 499594000, 499595000,
       499596000, 499597000, 499598000, 499599000, 499600000, 499601000,
       499602000, 499603000, 499604000, 499605000, 499606000, 499607000,
       499608000, 499609000, 499610000, 499611000, 499612000, 499613000,
       499614000, 499615000, 499616000, 499617000, 499618000, 499619000,
       499620000, 499621000, 499622000, 499623000, 499624000, 499625000,
       499626000, 499627000, 499628000, 499629000, 499630000, 499631000,
       499632000, 499633000, 499634000, 499635000, 499636000, 499637000,
       499638000, 499639000, 499640000, 499641000, 499642000, 499643000,
       499644000, 499645000, 499646000, 499647000, 499648000, 499649000,
       499650000, 499651000, 499652000, 499653000, 499654000, 499655000,
       499656000, 499657000, 499658000, 499659000, 499660000, 499661000,
       499662000, 499663000, 499664000, 499665000, 499666000, 499667000,
       499668000, 499669000, 499670000, 499671000, 499672000, 499673000,
       499674000, 499675000, 499676000, 499677000, 499678000, 499679000,
       499680000, 499681000, 499682000, 499683000, 499684000, 499685000,
       499686000, 499687000, 499688000, 499689000, 499690000, 499691000,
       499692000, 499693000, 499694000, 499695000, 499696000, 499697000,
       499698000, 499699000, 499700000, 499701000, 499702000, 499703000,
       499704000, 499705000, 499706000, 499707000, 499708000, 499709000,
       499710000, 499711000, 499712000, 499713000, 499714000, 499715000,
       499716000, 499717000, 499718000, 499719000, 499720000, 499721000,
       499722000, 499723000, 499724000, 499725000, 499726000, 499727000,
       499728000, 499729000, 499730000, 499731000, 499732000, 499733000,
       499734000, 499735000, 499736000, 499737000, 499738000, 499739000,
       499740000, 499741000, 499742000, 499743000, 499744000, 499745000,
       499746000, 499747000, 499748000, 499749000, 499750000, 499751000,
       499752000, 499753000, 499754000, 499755000, 499756000, 499757000,
       499758000, 499759000, 499760000, 499761000, 499762000, 499763000,
       499764000, 499765000, 499766000, 499767000, 499768000, 499769000,
       499770000, 499771000, 499772000, 499773000, 499774000, 499775000,
       499776000, 499777000, 499778000, 499779000, 499780000, 499781000,
       499782000, 499783000, 499784000, 499785000, 499786000, 499787000,
       499788000, 499789000, 499790000, 499791000, 499792000, 499793000,
       499794000, 499795000, 499796000, 499797000, 499798000, 499799000,
       499800000, 499801000, 499802000, 499803000, 499804000, 499805000,
       499806000, 499807000, 499808000, 499809000, 499810000, 499811000,
       499812000, 499813000, 499814000, 499815000, 499816000, 499817000,
       499818000, 499819000, 499820000, 499821000, 499822000, 499823000,
       499824000, 499825000, 499826000, 499827000, 499828000, 499829000,
       499830000, 499831000, 499832000, 499833000, 499834000, 499835000,
       499836000, 499837000, 499838000, 499839000, 499840000, 499841000,
       499842000, 499843000, 499844000, 499845000, 499846000, 499847000,
       499848000, 499849000, 499850000, 499851000, 499852000, 499853000,
       499854000, 499855000, 499856000, 499857000, 499858000, 499859000,
       499860000, 499861000, 499862000, 499863000, 499864000, 499865000,
       499866000, 499867000, 499868000, 499869000, 499870000, 499871000,
       499872000, 499873000, 499874000, 499875000, 499876000, 499877000,
       499878000, 499879000, 499880000, 499881000, 499882000, 499883000,
       499884000, 499885000, 499886000, 499887000, 499888000, 499889000,
       499890000, 499891000, 499892000, 499893000, 499894000, 499895000,
       499896000, 499897000, 499898000, 499899000, 499900000, 499901000,
       499902000, 499903000, 499904000, 499905000, 499906000, 499907000,
       499908000, 499909000, 499910000, 499911000, 499912000, 499913000,
       499914000, 499915000, 499916000, 499917000, 499918000, 499919000,
       499920000, 499921000, 499922000, 499923000, 499924000, 499925000,
       499926000, 499927000, 499928000, 499929000, 499930000, 499931000,
       499932000, 499933000, 499934000, 499935000, 499936000, 499937000,
       499938000, 499939000, 499940000, 499941000, 499942000, 499943000,
       499944000, 499945000, 499946000, 499947000, 499948000, 499949000,
       499950000, 499951000, 499952000, 499953000, 499954000, 499955000,
       499956000, 499957000, 499958000, 499959000, 499960000, 499961000,
       499962000, 499963000, 499964000, 499965000, 499966000, 499967000,
       499968000, 499969000, 499970000, 499971000, 499972000, 499973000,
       499974000, 499975000, 499976000, 499977000, 499978000, 499979000,
       499980000, 499981000, 499982000, 499983000, 499984000, 499985000,
       499986000, 499987000, 499988000, 499989000, 499990000, 499991000,
       499992000, 499993000, 499994000, 499995000, 499996000, 499997000,
       499998000, 499999000, 500000000, 500001000, 500002000, 500003000,
       500004000, 500005000, 500006000, 500007000, 500008000, 500009000,
       500010000, 500011000, 500012000, 500013000, 500014000, 500015000,
       500016000, 500017000, 500018000, 500019000, 500020000, 500021000,
       500022000, 500023000, 500024000, 500025000, 500026000, 500027000,
       500028000, 500029000, 500030000, 500031000, 500032000, 500033000,
       500034000, 500035000, 500036000, 500037000, 500038000, 500039000,
       500040000, 500041000, 500042000, 500043000, 500044000, 500045000,
       500046000, 500047000, 500048000, 500049000, 500050000, 500051000,
       500052000, 500053000, 500054000, 500055000, 500056000, 500057000,
       500058000, 500059000, 500060000, 500061000, 500062000, 500063000,
       500064000, 500065000, 500066000, 500067000, 500068000, 500069000,
       500070000, 500071000, 500072000, 500073000, 500074000, 500075000,
       500076000, 500077000, 500078000, 500079000, 500080000, 500081000,
       500082000, 500083000, 500084000, 500085000, 500086000, 500087000,
       500088000, 500089000, 500090000, 500091000, 500092000, 500093000,
       500094000, 500095000, 500096000, 500097000, 500098000, 500099000,
       500100000, 500101000, 500102000, 500103000, 500104000, 500105000,
       500106000, 500107000, 500108000, 500109000, 500110000, 500111000,
       500112000, 500113000, 500114000, 500115000, 500116000, 500117000,
       500118000, 500119000, 500120000, 500121000, 500122000, 500123000,
       500124000, 500125000, 500126000, 500127000, 500128000, 500129000,
       500130000, 500131000, 500132000, 500133000, 500134000, 500135000,
       500136000, 500137000, 500138000, 500139000, 500140000, 500141000,
       500142000, 500143000, 500144000, 500145000, 500146000, 500147000,
       500148000, 500149000, 500150000, 500151000, 500152000, 500153000,
       500154000, 500155000, 500156000, 500157000, 500158000, 500159000,
       500160000, 500161000, 500162000, 500163000, 500164000, 500165000,
       500166000, 500167000, 500168000, 500169000, 500170000, 500171000,
       500172000, 500173000, 500174000, 500175000, 500176000, 500177000,
       500178000, 500179000, 500180000, 500181000, 500182000, 500183000,
       500184000, 500185000, 500186000, 500187000, 500188000, 500189000,
       500190000, 500191000, 500192000, 500193000, 500194000, 500195000,
       500196000, 500197000, 500198000, 500199000, 500200000, 500201000,
       500202000, 500203000, 500204000, 500205000, 500206000, 500207000,
       500208000, 500209000, 500210000, 500211000, 500212000, 500213000,
       500214000, 500215000, 500216000, 500217000, 500218000, 500219000,
       500220000, 500221000, 500222000, 500223000, 500224000, 500225000,
       500226000, 500227000, 500228000, 500229000, 500230000, 500231000,
       500232000, 500233000, 500234000, 500235000, 500236000, 500237000,
       500238000, 500239000, 500240000, 500241000, 500242000, 500243000,
       500244000, 500245000, 500246000, 500247000, 500248000, 500249000,
       500250000, 500251000, 500252000, 500253000, 500254000, 500255000,
       500256000, 500257000, 500258000, 500259000, 500260000, 500261000,
       500262000, 500263000, 500264000, 500265000, 500266000, 500267000,
       500268000, 500269000, 500270000, 500271000, 500272000, 500273000,
       500274000, 500275000, 500276000, 500277000, 500278000, 500279000,
       500280000, 500281000, 500282000, 500283000, 500284000, 500285000,
       500286000, 500287000, 500288000, 500289000, 500290000, 500291000,
       500292000, 500293000, 500294000, 500295000, 500296000, 500297000,
       500298000, 500299000, 500300000, 500301000, 500302000, 500303000,
       500304000, 500305000, 500306000, 500307000, 500308000, 500309000,
       500310000, 500311000, 500312000, 500313000, 500314000, 500315000,
       500316000, 500317000, 500318000, 500319000, 500320000, 500321000,
       500322000, 500323000, 500324000, 500325000, 500326000, 500327000,
       500328000, 500329000, 500330000, 500331000, 500332000, 500333000,
       500334000, 500335000, 500336000, 500337000, 500338000, 500339000,
       500340000, 500341000, 500342000, 500343000, 500344000, 500345000,
       500346000, 500347000, 500348000, 500349000, 500350000, 500351000,
       500352000, 500353000, 500354000, 500355000, 500356000, 500357000,
       500358000, 500359000, 500360000, 500361000, 500362000, 500363000,
       500364000, 500365000, 500366000, 500367000, 500368000, 500369000,
       500370000, 500371000, 500372000, 500373000, 500374000, 500375000,
       500376000, 500377000, 500378000, 500379000, 500380000, 500381000,
       500382000, 500383000, 500384000, 500385000, 500386000, 500387000,
       500388000, 500389000, 500390000, 500391000, 500392000, 500393000,
       500394000, 500395000, 500396000, 500397000, 500398000, 500399000,
       500400000, 500401000, 500402000, 500403000, 500404000, 500405000,
       500406000, 500407000, 500408000, 500409000, 500410000, 500411000,
       500412000, 500413000, 500414000, 500415000, 500416000, 500417000,
       500418000, 500419000, 500420000, 500421000, 500422000, 500423000,
       500424000, 500425000, 500426000, 500427000, 500428000, 500429000,
       500430000, 500431000, 500432000, 500433000, 500434000, 500435000,
       500436000, 500437000, 500438000, 500439000, 500440000, 500441000,
       500442000, 500443000, 500444000, 500445000, 500446000, 500447000,
       500448000, 500449000, 500450000, 500451000, 500452000, 500453000,
       500454000, 500455000, 500456000, 500457000, 500458000, 500459000,
       500460000, 500461000, 500462000, 500463000, 500464000, 500465000,
       500466000, 500467000, 500468000, 500469000, 500470000, 500471000,
       500472000, 500473000, 500474000, 500475000, 500476000, 500477000,
       500478000, 500479000, 500480000, 500481000, 500482000, 500483000,
       500484000, 500485000, 500486000, 500487000, 500488000, 500489000,
       500490000, 500491000, 500492000, 500493000, 500494000, 500495000,
       500496000, 500497000, 500498000, 500499000])

In [14]:
np.einsum('ik', b)


Out[14]:
array([[     0,      1,      2, ...,    997,    998,    999],
       [  1000,   1001,   1002, ...,   1997,   1998,   1999],
       [  2000,   2001,   2002, ...,   2997,   2998,   2999],
       ..., 
       [997000, 997001, 997002, ..., 997997, 997998, 997999],
       [998000, 998001, 998002, ..., 998997, 998998, 998999],
       [999000, 999001, 999002, ..., 999997, 999998, 999999]])

In [ ]:


In [115]:
w = "word"
len(w)


Out[115]:
4

In [ ]:
cd ../positive_text

In [3]:
stopwords = set(nltk.corpus.stopwords.words(fileids='english'))

In [ ]:


In [4]:
# Build `docs`: one cleaned, space-joined string per text file found one or two
# directory levels below `directory`.
directory = "/Users/michael/Documents/positive_text/"
vocab = set(wordvecs.vocab.keys())


def _clean_text(raw_text):
    """Whitespace-tokenise ``raw_text`` and re-join the tokens worth keeping.

    A token is kept when it is in ``vocab`` (has a word vector), is purely
    alphabetic, is not in ``stopwords``, is not the literal "ve", and is
    longer than one character.
    """
    kept = [word for word in raw_text.split()
            if word in vocab
            and word.isalpha()
            and word not in stopwords
            and word != "ve"
            and len(word) > 1]
    return ' '.join(kept)


def _load_clean_doc(file_path):
    """Return the cleaned text of ``file_path``, or None if it cannot be read."""
    try:
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'r') as f:
            return _clean_text(f.read())
    except IOError:
        return None


docs = []
for path in os.listdir(directory):
    category_dir = os.path.join(directory, path)
    # os.listdir returns entries in arbitrary order, so dropping "the first
    # one" is not reliable; skip anything that is not a directory instead.
    if not os.path.isdir(category_dir):
        continue
    for subpath in os.listdir(category_dir):
        entry = os.path.join(category_dir, subpath)
        # An entry is either a document or a folder holding documents one level down.
        if os.path.isdir(entry):
            candidates = [os.path.join(entry, name) for name in os.listdir(entry)]
        else:
            candidates = [entry]
        for file_path in candidates:
            txt = _load_clean_doc(file_path)
            # Same minimum-length rule at both nesting levels, so (near-)empty
            # documents never enter the corpus.
            if txt is not None and len(txt) > 5:
                docs.append(txt)

In [ ]:
import numpy as np

In [ ]:
!touch cleandocs.txt

In [ ]:
pwd/

In [6]:
from operator import itemgetter

In [9]:
# Draw 500 document indices uniformly at random (with replacement) and wrap them
# in an itemgetter, so that `subsample(docs)` returns the chosen documents as a tuple.
# The bound is taken from len(docs) (randint's upper bound is exclusive) so every
# index is valid whatever the corpus size; randint replaces the deprecated
# np.random.random_integers.
subsample = itemgetter(*np.random.randint(0, len(docs), 500).tolist())

In [ ]:


In [11]:
# Persist the sampled documents as plain text, one document per line.
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
# (np.savetxt(f, docs) was tried here previously and is left disabled.)
with open(f, 'w') as fi:
    fi.writelines("%s\n" % doc for doc in subsample(docs))

In [30]:
# Read the sampled corpus back: `text` is a list with one document per line,
# line terminators removed.
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
with open(f, 'r') as fi:
    # The with-block closes the file on exit, so no explicit close() is needed.
    text = fi.read().splitlines()

In [ ]:
cd Insurance/

Proof that my method of doing np.sum on an array of lists was slower than alternative methods


In [98]:
b= np.random.random_sample(size=(10000, 50))

In [99]:
%%timeit
c = np.zeros(50)
for line in b:
    c += line


100 loops, best of 3: 8.41 ms per loop

In [100]:
%%timeit
arrs = []
for line in b:
    arrs.append(line)
d = np.array(arrs) 
np.sum(d, axis=0)


100 loops, best of 3: 11.3 ms per loop

In [105]:
a = {"a":1, "b": 2}

In [107]:
[k for k, v in a.iteritems() if v == 1]


Out[107]:
['a']

In [109]:
b = set([1, 2, 3])
c = set([2, 3,4,5])

In [114]:
d = set(["children", "contact", "holla"])

In [113]:
(docs[1], docs[2])


Out[113]:
'children adults adults give permission contact patient give please ensure copy release form medical give permission contact patient give please ensure copy release form medical treatment appropriate details check relevant describe checked items comments section noted thoughts passive acute check relevant describe comments section noted thoughts passive acute reactions member currently prescribed medication yes please indicate prescribed prescribed dates initial prescription name doctor prescribing medication check indicate member adherent risks benefits medication adherence discussed member taking medications yes please list indicate previous dates previous treatment response treatment interventions responses clinical data recent lab tests consultation reports physicians patients age milestones met behavioral health inpatient admissions multiple behavioral diagnosis ideation created document sample tool assist providers require use collecting information contained treatment goals must objective estimated time frames treatment plan developed understanding treatment plan documented medical item indicate outcome measures evidenced referral consistent treatment therapist developed plan agreement working issues understand treatment goals developed created document sample tool assist providers require use collecting information contained completed within days last visit date last patient agreement termination patient return scheduled list made contact patient treatment goals treatment goals appropriate referrals must appropriate patient became unable conduct activities daily living course patient referred appropriate level created document sample tool assist providers require use collecting information contained'

In [2]:
from sklearn.cluster import KMeans

In [3]:
km = KMeans(n_clusters=15, n_jobs=-1)

In [ ]: