Notebook for playing around, testing, and speed comparisons


In [1]:
from __future__ import division

import numpy as np
from gensim.models import Word2Vec 
import scipy.stats as stat
from scipy.special import gamma
import scipy
from numpy import pi
from numba import jit, vectorize, float32, float64
import time
import line_profiler
import nltk
import os
import cProfile
%load_ext line_profiler  # registers the %lprun magic (the deprecated define_magic call is unnecessary)
%load_ext autoreload

In [26]:
wordvecs = Word2Vec.load_word2vec_format(
    "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt", binary=False)


WARNING:gensim.models.word2vec:consider setting layer size to a multiple of 4 for greater performance

In [ ]:
len(wordvecs.vocab.keys())

In [ ]:
cd GaussianLDA/

In [56]:
import FastGaussianLDA
%aimport FastGaussianLDA
%autoreload 1
# import GaussianLDA
# reload(Gauss_LDA)
# reload(GaussianLDA)

# corpus = ['apple banana mango melon',
#  'barrel cannon gun war',
#  'democrat republican senate politics']
corpus2 = ["apple orange mango melon ", "canvas art mural paint painting ", "pineapple kiwi grape strawberry ",
              "picture frame picasso sculpture art ", "coconut guava blueberry blackberry ", "statue monument art artist "]
# corpus2 = [sent * 5 for sent in corpus2]*4
wordvec_fileapth = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"
start  = time.time()
g = FastGaussianLDA.Gauss_LDA(2, corpus2, word_vector_model=wordvecs)
g.fit(20)
print (time.time()-start)


Done processing corpus with 6 documents
There are 24 words that could be converted to word vectors in your corpus
There are 0 words that could NOT be converted to word vectors
Initialization complete
Starting fit
apple
orange
mango
melon
########################
 ###############
  ################# 0
canvas
art
mural
paint
painting
########################
 ###############
  ################# 1
pineapple
kiwi
grape
strawberry
########################
 ###############
  ################# 2
picture
frame
picasso
sculpture
art
########################
 ###############
  ################# 3
coconut
guava
blueberry
blackberry
########################
 ###############
  ################# 4
statue
monument
art
artist
########################
 ###############
  ################# 5
0 iterations complete
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-56-849d38e6d5c8> in <module>()
     15 start  = time.time()
     16 g = FastGaussianLDA.Gauss_LDA(2, corpus2, word_vector_model=wordvecs)
---> 17 g.fit(20)
     18 print (time.time()-start)

/Users/michael/Documents/GaussianLDA/FastGaussianLDA.py in fit(self, iterations, init)
    134                     results_file = "/Users/michael/Documents/GaussianLDA/output/iter{0}topic{1}{2}.txt".format(i, k, param)
    135                     open(results_file, 'w')
--> 136                     np.savetxt(results_file, self.topic_params[k][param])
    137 
    138     def init(self):

KeyError: 'Topic Covariance'

Old time was 237 seconds for 30 iterations without Numba. The KeyError above comes from the parameter-saving step in fit: the loop iterates over a parameter list that includes 'Topic Covariance', but topic_params (see Out[49] below) only stores 'Chol Det', 'Lower Triangle', 'Topic Count', 'Topic Kappa', 'Topic Mean', and 'Topic Nu'.

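A possible workaround (a sketch, not the library's own fix): iterate over the keys that actually exist in topic_params instead of a fixed name list, and wrap each value in np.atleast_2d so np.savetxt also accepts the scalar entries. The output path here is hypothetical, mirroring the pattern in FastGaussianLDA.fit.

In [ ]:
for k in g.topic_params:
    for param, value in g.topic_params[k].items():
        # hypothetical output path, modeled on the one in FastGaussianLDA.fit
        results_file = "/Users/michael/Documents/GaussianLDA/output/topic{0}{1}.txt".format(k, param)
        np.savetxt(results_file, np.atleast_2d(value))  # atleast_2d handles scalars, vectors, and matrices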

In [49]:
g.topic_params


Out[49]:
defaultdict(dict,
            {0: {'Chol Det': 60.460999900544195,
              'Lower Triangle': array([[ 1.8254134 ,  0.        ,  0.        , ...,  0.        ,
                       0.        ,  0.        ],
                     [-0.06680332,  1.98160203,  0.        , ...,  0.        ,
                       0.        ,  0.        ],
                     [ 0.08098257, -0.03744569,  1.90105757, ...,  0.        ,
                       0.        ,  0.        ],
                     ..., 
                     [-0.00757099, -0.13980339,  0.01847824, ...,  1.82585866,
                       0.        ,  0.        ],
                     [ 0.0211067 , -0.07333534,  0.01993555, ..., -0.0033345 ,
                       1.7805862 ,  0.        ],
                     [-0.08579739, -0.03275877, -0.02742346, ...,  0.05692567,
                      -0.02328802,  1.79330621]]),
              'Topic Count': 7.0,
              'Topic Kappa': 7.0099999999999998,
              'Topic Mean': array([-0.02504542,  0.2188665 , -0.17538328,  0.02145378,  0.11539563,
                     -0.06780591, -0.18270968, -0.27622795, -0.07056386,  0.0993828 ,
                     -0.04235218,  0.01072836,  0.14253509, -0.02254983,  0.05916563,
                     -0.05124792, -0.03881309,  0.05071992, -0.03317038, -0.13346982,
                      0.13906787, -0.03837078, -0.12836997, -0.08083544,  0.02219288,
                     -0.16808645, -0.28662694,  0.08816469,  0.01411173, -0.18738835,
                      0.511033  , -0.165015  , -0.04084815, -0.1611245 , -0.0859283 ,
                      0.12275199, -0.0469765 ,  0.11622649,  0.00719758, -0.07863593,
                      0.09010109, -0.01056698,  0.01697799, -0.10904516,  0.07542315,
                      0.10887135,  0.03931673, -0.22873025, -0.00079732, -0.12177765], dtype=float32),
              'Topic Nu': 57.0},
             1: {'Chol Det': 61.188559372852168,
              'Lower Triangle': array([[ 1.84495746,  0.        ,  0.        , ...,  0.        ,
                       0.        ,  0.        ],
                     [-0.0112136 ,  2.02942633,  0.        , ...,  0.        ,
                       0.        ,  0.        ],
                     [ 0.07961683,  0.00438719,  1.93522247, ...,  0.        ,
                       0.        ,  0.        ],
                     ..., 
                     [-0.02836843, -0.09940096, -0.05450467, ...,  1.82035156,
                       0.        ,  0.        ],
                     [ 0.01260772, -0.09793474,  0.01187377, ..., -0.00602112,
                       1.78691005,  0.        ],
                     [-0.08285196,  0.01008334, -0.02592057, ...,  0.06094575,
                      -0.04099915,  1.79651351]]),
              'Topic Count': 19.0,
              'Topic Kappa': 19.010000000000002,
              'Topic Mean': array([ 0.01110858,  0.01746882, -0.08944207,  0.04714822,  0.06973268,
                      0.04762921, -0.10316735, -0.08513957, -0.01927804,  0.07138942,
                     -0.02017127,  0.02703014,  0.10562637,  0.01041498,  0.03951097,
                     -0.03484849,  0.01087443,  0.06866364,  0.00883327, -0.1412762 ,
                     -0.00565969, -0.04856285,  0.02408766, -0.04376699, -0.00129946,
                      0.02005883, -0.07292756,  0.11269723,  0.06587683, -0.09289835,
                      0.1859474 , -0.09804082, -0.07072636,  0.07159687,  0.00420334,
                      0.0655855 , -0.07414272,  0.05887394,  0.06787711, -0.08478291,
                      0.01003541, -0.01307604, -0.06682431, -0.0605235 ,  0.07158273,
                      0.03403937,  0.0234981 , -0.12305666,  0.01108958, -0.08817945], dtype=float32),
              'Topic Nu': 69.0}})

In [50]:
np.savetxt("/Users/michael/Documents/GaussianLDA/output/test.txt", g.topic_params[0]["Topic Mean"])

In [ ]:


In [ ]:
mean1 = g.topic_params[0]["Topic Mean"]
mean2 = g.topic_params[1]["Topic Mean"]
mean3 = g.topic_params[2]["Topic Mean"]  # only valid if the model was fit with >= 3 topics
wordvecs.most_similar(positive=[mean1])

In [ ]:
wordvecs.most_similar(positive=[mean2])

In [ ]:
wordvecs.most_similar(positive=[mean3])

In [ ]:
newdoc = "fruit apple orange mango taco painting sculpture book art artist picasso"

In [ ]:
def myfunc(mat):
    np.linalg.det(mat)
    return np.linalg.inv(mat)

# profile on a random (almost surely nonsingular) matrix;
# np.arange(100).reshape(10, 10) is only rank 2, so inv would raise LinAlgError
%prun myfunc(np.random.rand(10, 10))
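Since %lprun was loaded above, the same call can also be profiled line by line (a sketch):

In [ ]:
%lprun -f myfunc myfunc(np.random.rand(10, 10))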

Experimenting with Numba accelerations:

  • Using numpy ufuncs (a minimal @vectorize sketch follows)

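The first cell already imports vectorize, float32, and float64, so here is a minimal custom-ufunc sketch (illustrative; scaled_diff is a made-up kernel): @vectorize compiles the scalar kernel once per listed signature, and the result broadcasts over arrays like any built-in NumPy ufunc.

In [ ]:
@vectorize([float32(float32, float32), float64(float64, float64)])
def scaled_diff(a, b):
    # scalar kernel; Numba compiles it for each listed signature
    return (a - b) * 0.5

scaled_diff(np.arange(4.0), np.ones(4))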
In [ ]:
import numpy as np

In [3]:
a = np.cov(np.arange(50.0**2, dtype=np.float32).reshape(50, 50))
a = np.eye(50) * 4.0  # overwrite with a well-conditioned 4*I for the downdate tests
# scipy.linalg.cholesky(a, lower=True, overwrite_a=True)
x = np.arange(50) / 200.0  # floats, scaled so a a^T - x x^T stays positive definite (ints would truncate in-place updates)

In [4]:
from numba import generated_jit, vectorize

In [5]:
b = np.arange(1000*1000).reshape(1000,1000)

In [56]:
@jit(cache=True)
def func(val):
    return np.sum(val, axis=0)  # return the result so the call is not dead code

In [57]:
%%timeit
func(b)


The slowest run took 63.91 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 991 µs per loop

In [58]:
%%timeit
def func(val):
    np.sum(val, axis=0)


10000000 loops, best of 3: 90.2 ns per loop

Note: this cell never calls func, so the 90 ns measures only the def statement itself, not the sum.
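A fairer comparison (a sketch): compile the jitted function once outside the timed region, then time the calls themselves with the timeit module.

In [ ]:
import timeit

@jit(nopython=True)
def col_sum(a):
    # explicit loops; nopython mode compiles these to tight machine code
    out = np.zeros(a.shape[1])
    for i in range(a.shape[0]):
        for j in range(a.shape[1]):
            out[j] += a[i, j]
    return out

col_sum(b)  # warm-up call triggers JIT compilation
print(timeit.timeit(lambda: col_sum(b), number=100))         # jitted loop sum
print(timeit.timeit(lambda: np.sum(b, axis=0), number=100))  # NumPy baseline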
  • Using python operators

In [ ]:
@jit(nopython=True)
def chol_downdate(L, X):
    """
    Cholesky rank-1 downdate: returns the factor of L L^T - X X^T.
    :param L: lower-triangular matrix from a Cholesky decomposition
    :param X: word vector with the same column dimensionality as L
    :return: updated lower-triangular matrix
    """
    # assert L.shape[1] == X.shape[0]
    L_c = np.copy(L)  # work on a copy so the original factor survives an unstable downdate
    r = 0.0
    for k in range(X.shape[0]):
        r = np.sqrt(L_c[k, k]**2 - X[k]**2)
        c = r / L_c[k, k]
        s = X[k] / L_c[k, k]
        L_c[k, k] = r

        for i in range(k + 1, X.shape[0]):
            L_c[i, k] = (L_c[i, k] - s * X[i]) / c
            X[i] = c * X[i] - s * L_c[i, k]
    if np.isnan(r) or np.isinf(r):
        print("Warning: NaN/Inf in downdate; returning the original factor")
        return L  # good reason for making the copy: fall back if the downdate becomes unstable
    return L_c
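The routine can be sanity-checked against a direct factorization (a quick sketch; note that it mutates X, so pass a copy):

In [ ]:
A = np.eye(50) * 4.0
v = np.full(50, 0.1)                             # small enough that A - v v^T stays positive definite
L0 = np.linalg.cholesky(A)
L_down = chol_downdate(L0, v.copy())             # rank-1 downdate of chol(A)
L_ref = np.linalg.cholesky(A - np.outer(v, v))   # factor A - v v^T directly
print(np.allclose(L_down, L_ref))                # expect True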

In [ ]:
%%timeit
chol_downdate(a, x.copy())  # pass a copy: the routine mutates X in place
  • Using numpy ufuncs is SLOWER

In [ ]:
@jit(nopython=True)
def chol_downdate(L, X):
    """
    Cholesky rank-1 downdate, written with np.power instead of the ** operator.
    :param L: lower-triangular matrix from a Cholesky decomposition
    :param X: word vector with the same column dimensionality as L
    :return: updated lower-triangular matrix
    """
    assert L.shape[1] == X.shape[0]
    L_c = np.copy(L)  # work on a copy so the original factor survives an unstable downdate
    r = 0.0
    for k in range(X.shape[0]):
        r = np.sqrt(np.power(L_c[k, k], 2) - np.power(X[k], 2))
        c = r / L_c[k, k]
        s = X[k] / L_c[k, k]
        L_c[k, k] = r

        for i in range(k + 1, X.shape[0]):
            L_c[i, k] = (L_c[i, k] - s * X[i]) / c
            X[i] = c * X[i] - s * L_c[i, k]
    if np.isnan(r) or np.isinf(r):
        print("Warning: NaN/Inf in downdate; returning the original factor")
        return L  # good reason for making the copy: fall back if the downdate becomes unstable
    return L_c

In [ ]:
%%timeit
chol_downdate(a, x.copy())  # pass a copy: the routine mutates X in place

In [9]:
np.sum(b, axis=0)


Out[9]:
array([499500000, 499501000, 499502000, ..., 500497000, 500498000,
       500499000])

In [14]:
np.einsum('ik', b)  # no summed or reordered indices, so this just returns b unchanged


Out[14]:
array([[     0,      1,      2, ...,    997,    998,    999],
       [  1000,   1001,   1002, ...,   1997,   1998,   1999],
       [  2000,   2001,   2002, ...,   2997,   2998,   2999],
       ..., 
       [997000, 997001, 997002, ..., 997997, 997998, 997999],
       [998000, 998001, 998002, ..., 998997, 998998, 998999],
       [999000, 999001, 999002, ..., 999997, 999998, 999999]])

In [ ]:


In [ ]:


In [ ]:
cd ../positive_text

In [27]:
stopwords = set(nltk.corpus.stopwords.words(fileids='english'))

In [28]:
directory = "/Users/michael/Documents/positive_text/"
docs = []
vocab = set(wordvecs.vocab.keys())
for path in os.listdir(directory)[1:]:
    for subpath in os.listdir(directory + path):
        try:
            with open(directory + path + '/' + subpath, 'r') as f:
                txt = f.read()
                txt = txt.split()                   
                txt = [word for word in txt if word in vocab and word.isalpha() 
                      and word not in stopwords and word != "ve"]
                txt = ' '.join(txt)
                if len(txt) > 5: docs.append(txt)
                f.close()
        except IOError: 
            for subsubpath in os.listdir(directory + path + '/' + subpath):
                try:
                    with open(directory + path + '/' + subpath + '/' + subsubpath, 'r') as f:
                        txt = f.read()
                        txt = txt.split()
                        txt = [word for word in txt if word in vocab and word.isalpha() 
                      and word not in stopwords and word != "ve"]
                        txt = ' '.join(txt)
                        docs.append(txt)
                except IOError: 
                    continue
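The same cleaning pass can be written with os.walk, which handles any nesting depth without the manual IOError fallback (a sketch, assuming the same directory, vocab, and stopwords as above):

In [ ]:
docs = []
for root, _, files in os.walk(directory):
    for name in files:
        try:
            with open(os.path.join(root, name), 'r') as f:
                words = [w for w in f.read().split() if w in vocab and w.isalpha()
                         and w not in stopwords and w != "ve"]
        except IOError:
            continue
        doc = ' '.join(words)
        if len(doc) > 5:
            docs.append(doc)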

In [ ]:
import numpy as np

In [ ]:
!touch cleandocs.txt

In [ ]:
pwd

In [29]:
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
# np.savetxt(f, docs)
with open(f, 'w') as fi:
    for doc in docs:
        fi.write("%s\n" % doc)

In [30]:
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
with open(f, 'r') as fi:
    text = fi.read().splitlines()

In [33]:
len(text)


Out[33]:
5000

In [ ]:
len([doc.split() for doc in docs])

In [ ]:
docs[9]

In [ ]:
ls

In [ ]:
cd Insurance/

In [ ]:
with open("+PHI5_Ins-+_113529_SCAN0145.txt", 'r') as f:
    txt = f.read()
f.close()

In [ ]:
txt

In [ ]:
len(txt)

In [ ]:
nltk.tokenize.word_tokenize(txt)

In [41]:
np.where(a == 4)  # a is 4*np.eye(50) from above, so this picks out the diagonal


Out[41]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]))

In [2]:
np.einsum?

In [ ]: