In [1]:
from __future__ import division
import numpy as np
from gensim.models import Word2Vec
import scipy.stats as stat
from scipy.special import gamma
import scipy
from numpy import pi
from numba import jit, vectorize, float32, float64
import time
import line_profiler
import nltk
import os
import cProfile
import IPython
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)  # pre-extension registration API; %load_ext line_profiler below is sufficient on its own
%load_ext line_profiler
%load_ext autoreload
In [26]:
wordvecs = Word2Vec.load_word2vec_format(
"/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt", binary=False)
In [ ]:
len(wordvecs.vocab.keys())
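A quick sanity check on the loaded vectors; the word 'apple' and its exact neighbours are just an assumed example and depend on the GloVe file.
In [ ]:
print(wordvecs['apple'].shape)  # expect (50,) for the 6B.50d GloVe vectors
wordvecs.most_similar('apple', topn=5)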
In [ ]:
cd GaussianLDA/
In [56]:
import FastGaussianLDA
%aimport FastGaussianLDA
%autoreload 1
# import GaussianLDA
# reload(Gauss_LDA)
# reload(GaussianLDA)
# corpus = ['apple banana mango melon',
# 'barrel cannon gun war',
# 'democrat republican senate politics']
corpus2 = ["apple orange mango melon ", "canvas art mural paint painting ", "pineapple kiwi grape strawberry ",
"picture frame picasso sculpture art ", "coconut guava blueberry blackberry ", "statue monument art artist "]
# corpus2 = [sent * 5 for sent in corpus2]*4
wordvec_filepath = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"  # unused below; the already-loaded wordvecs model is passed in directly
start = time.time()
g = FastGaussianLDA.Gauss_LDA(2, corpus2, word_vector_model=wordvecs)
g.fit(20)
print (time.time()-start)
The old run time was 237 seconds for 30 iterations without Numba.
In [49]:
g.topic_params
Out[49]:
In [50]:
np.savetxt("/Users/michael/Documents/GaussianLDA/output/test.txt", g.topic_params[0]["Topic Mean"])
In [ ]:
# note: the fit above used 2 topics, so topic_params[2] only exists for a run with at least 3 topics
mean1 = g.topic_params[0]["Topic Mean"]
mean2 = g.topic_params[1]["Topic Mean"]
mean3 = g.topic_params[2]["Topic Mean"]
wordvecs.most_similar(positive=[mean1])
In [ ]:
wordvecs.most_similar(positive=[mean2])
In [ ]:
wordvecs.most_similar(positive=[mean3])
In [ ]:
newdoc = "fruit apple orange mango taco painting sculpture book art artist picasso"
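A minimal sketch of scoring newdoc against the fitted topics using only the topic means (cosine similarity to each mean, ignoring the covariances, so this is not the full Gaussian posterior); assumes topic_params is indexable by topic number 0..K-1 as above.
In [ ]:
# rough per-word topic assignment: nearest topic mean by cosine similarity
means = [g.topic_params[k]["Topic Mean"] for k in range(len(g.topic_params))]
for word in newdoc.split():
    if word not in wordvecs.vocab:
        continue
    v = wordvecs[word]
    sims = [np.dot(v, m) / (np.linalg.norm(v) * np.linalg.norm(m)) for m in means]
    print("%s -> topic %d" % (word, np.argmax(sims)))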
In [ ]:
def myfunc(mat):
    np.linalg.det(mat)  # result discarded; only here to give the profiler extra work
    return np.linalg.inv(mat)
# add the identity so the input is non-singular (np.arange(100).reshape(10, 10) has rank 2)
%prun myfunc(np.arange(100.).reshape(10, 10) + np.eye(10))
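Since line_profiler is loaded above, the same toy function can also be profiled line by line; this cell just illustrates the %lprun syntax.
In [ ]:
# line-by-line profile of myfunc (-f names the function to instrument)
%lprun -f myfunc myfunc(np.arange(100.).reshape(10, 10) + np.eye(10))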
In [ ]:
import numpy as np
In [3]:
a = np.cov(np.arange(50.0**2, dtype=np.float32).reshape(50,50))
a = np.eye(50) * 4  # diagonal matrix used as a toy Cholesky factor below
# scipy.linalg.cholesky(a, lower=True, overwrite_a=True)
x = np.arange(50.)  # float dtype so chol_downdate can write into it in place
In [4]:
from numba import generated_jit, vectorize
In [5]:
b = np.arange(1000*1000).reshape(1000,1000)
In [56]:
@jit(cache=True)
def func(val):
    # jit adds nothing here: the body is a single NumPy call
    return np.sum(val, axis=0)
In [57]:
%%timeit
func(b)
In [58]:
%%timeit
np.sum(b, axis=0)  # un-jitted baseline; the original cell only timed the def statement itself
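Wrapping a single NumPy call in @jit leaves the work inside NumPy, so little speed-up should be expected; where Numba can help is with explicit loops. A small sketch for comparison (nopython mode, hypothetical colsum helper):
In [ ]:
@jit(nopython=True, cache=True)
def colsum(mat):
    # explicit-loop column sum; Numba compiles the loops to machine code
    out = np.zeros(mat.shape[1])
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            out[j] += mat[i, j]
    return out

colsum(b)  # first call triggers compilation
%timeit colsum(b)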
In [ ]:
@jit(float64[:, :](float64[:, :], float64[:]))
def chol_downdate(L, X):
    """
    Cholesky rank-1 downdate
    :param L: lower-triangular matrix from a Cholesky decomposition
    :param X: word vector with the same dimensionality as L's columns
    :return: updated lower-triangular matrix
    """
    # assert L.shape[1] == X.shape[0]
    # choldowndate(L.T, X)
    L_c = np.copy(L)  # work on a copy so the original factor survives an unstable downdate
    for k in range(X.shape[0]):
        r = np.sqrt(L_c[k, k]**2 - X[k]**2)
        c = r / L_c[k, k]
        s = X[k] / L_c[k, k]
        L_c[k, k] = r
        for i in range(k+1, X.shape[0]):
            L_c[i, k] = (L_c[i, k] - (s * X[i])) / c
            X[i] = (c * X[i]) - (s * L_c[i, k])
        if np.isnan(r) or np.isinf(r):
            print("YOU GOT Nans or infs: learn to code better shmuck")
            return L  # good reason for making the copy - return the untouched factor if the downdate becomes unstable
    return L_c
In [ ]:
%%timeit
chol_downdate(a, x)  # note: x is overwritten in place, so later timeit runs see a modified vector
In [ ]:
@jit
def chol_downdate(L, X):
    """
    Cholesky rank-1 downdate (lazy @jit version for comparison)
    :param L: lower-triangular matrix from a Cholesky decomposition
    :param X: word vector with the same dimensionality as L's columns
    :return: updated lower-triangular matrix
    """
    assert L.shape[1] == X.shape[0]
    # choldowndate(L.T, X)
    L_c = np.copy(L)  # work on a copy so the original factor survives an unstable downdate
    for k in range(X.shape[0]):
        r = np.sqrt(np.power(L_c[k, k], 2) - np.power(X[k], 2))
        c = r / L_c[k, k]
        s = X[k] / L_c[k, k]
        L_c[k, k] = r
        for i in range(k+1, X.shape[0]):
            L_c[i, k] = (L_c[i, k] - (s * X[i])) / c
            X[i] = (c * X[i]) - (s * L_c[i, k])
        if np.isnan(r) or np.isinf(r):
            print("YOU GOT Nans or infs: learn to code better shmuck")
            return L  # good reason for making the copy - return the untouched factor if the downdate becomes unstable
    return L_c
In [ ]:
%%timeit
chol_downdate(a, x)
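A quick correctness check for the downdate: if L is the lower Cholesky factor of A, then chol_downdate(L, x) should match the factor of A - x xᵀ recomputed from scratch (using a small SPD example where the downdate stays positive definite).
In [ ]:
# verify the rank-1 downdate against a full re-decomposition
A = np.eye(5) * 10.0
L0 = np.linalg.cholesky(A)
x0 = 0.1 * np.arange(1.0, 6.0)         # small enough that A - x0 x0^T stays positive definite
L_down = chol_downdate(L0, x0.copy())  # copy: the routine overwrites its vector argument
L_ref = np.linalg.cholesky(A - np.outer(x0, x0))
print(np.allclose(L_down, L_ref))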
In [9]:
np.sum(b, axis=0)
Out[9]:
In [14]:
np.einsum('ij->j', b)  # einsum equivalent of np.sum(b, axis=0) above ('ik' alone just returns the array unchanged)
Out[14]:
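einsum is also handy for the sums of outer products that show up in Gaussian LDA's covariance updates; a small sketch with a hypothetical matrix V of word vectors (one row per word):
In [ ]:
V = np.random.randn(100, 50)  # hypothetical: 100 word vectors, 50 dimensions
scatter_loop = sum(np.outer(v, v) for v in V)  # sum_i v_i v_i^T, Python loop
scatter_einsum = np.einsum('ij,ik->jk', V, V)  # same result, no Python loop
print(np.allclose(scatter_loop, scatter_einsum))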
In [ ]:
cd ../positive_text
In [27]:
stopwords = set(nltk.corpus.stopwords.words(fileids='english'))
In [28]:
directory = "/Users/michael/Documents/positive_text/"
docs = []
vocab = set(wordvecs.vocab.keys())
for path in os.listdir(directory)[1:]:  # [1:] skips the first entry (likely a hidden file such as .DS_Store)
    for subpath in os.listdir(directory + path):
        try:
            with open(directory + path + '/' + subpath, 'r') as f:
                txt = f.read()
            txt = txt.split()
            txt = [word for word in txt if word in vocab and word.isalpha()
                   and word not in stopwords and word != "ve"]
            txt = ' '.join(txt)
            if len(txt) > 5:
                docs.append(txt)
        except IOError:
            # the entry is itself a directory, so go one level deeper
            for subsubpath in os.listdir(directory + path + '/' + subpath):
                try:
                    with open(directory + path + '/' + subpath + '/' + subsubpath, 'r') as f:
                        txt = f.read()
                    txt = txt.split()
                    txt = [word for word in txt if word in vocab and word.isalpha()
                           and word not in stopwords and word != "ve"]
                    txt = ' '.join(txt)
                    docs.append(txt)
                except IOError:
                    continue
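The nested try/except above is only there because some entries under each path are themselves directories; os.walk handles that recursion directly. A sketch of the same cleaning pass (same filters and directory layout assumed):
In [ ]:
# equivalent traversal with os.walk, which descends into subdirectories for us
docs_walk = []
for root, dirs, files in os.walk(directory):
    for name in files:
        try:
            with open(os.path.join(root, name), 'r') as fh:
                words = fh.read().split()
        except IOError:
            continue
        words = [w for w in words if w in vocab and w.isalpha()
                 and w not in stopwords and w != "ve"]
        cleaned = ' '.join(words)
        if len(cleaned) > 5:
            docs_walk.append(cleaned)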
In [ ]:
import numpy as np
In [ ]:
!touch cleandocs.txt
In [ ]:
pwd
In [29]:
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
# np.savetxt(f, docs)
with open(f, 'w') as fi:
    for doc in docs:
        fi.write("%s\n" % doc)
In [30]:
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
with open(f, 'r') as fi:
    text = fi.read().splitlines()
In [33]:
len(text)
Out[33]:
In [ ]:
len([doc.split() for doc in docs])
In [ ]:
docs[9]
In [ ]:
ls
In [ ]:
cd Insurance/
In [ ]:
with open("+PHI5_Ins-+_113529_SCAN0145.txt", 'r') as f:
txt = f.read()
f.close()
In [ ]:
txt
In [ ]:
len(txt)
In [ ]:
nltk.tokenize.word_tokenize(txt)
In [41]:
np.where(a==4)
Out[41]:
In [2]:
np.einsum?
In [ ]: