In [1]:
from __future__ import division
import numpy as np
from gensim.models import Word2Vec
import scipy.stats as stat
from scipy.special import gamma
import scipy
from numpy import pi
from numba import jit, vectorize, float32, float64
import time
import line_profiler
import nltk
import os
import cProfile
import IPython
ip = IPython.get_ipython()
ip.define_magic('lprun', line_profiler.magic_lprun)
%load_ext line_profiler
%load_ext autoreload
In [2]:
wordvecs = Word2Vec.load_word2vec_format(
"/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt", binary=False)
In [ ]:
len(wordvecs.vocab.keys())
In [ ]:
cd GaussianLDA/
In [86]:
import FastGaussianLDA
%aimport FastGaussianLDA
%autoreload 1
# import GaussianLDA
# reload(Gauss_LDA)
# reload(GaussianLDA)
# corpus = ['apple banana mango melon',
# 'barrel cannon gun war',
# 'democrat republican senate politics']
corpus2 = ["apple orange mango melon ", "canvas art mural paint painting ", "pineapple kiwi grape strawberry ",
"picture frame picasso sculpture art ", "coconut guava blueberry blackberry ", "statue monument art artist "]
# corpus2 = [sent * 5 for sent in corpus2]*4
wordvec_filepath = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"
start = time.time()
g = FastGaussianLDA.Gauss_LDA(2, corpus2, word_vector_model=wordvecs)
g.fit(20)
print (time.time()-start)
Old run time was 237 seconds for 30 iterations without Numba.
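Since line_profiler is loaded at the top of the notebook, the sampler could also be profiled line by line. A hypothetical invocation (it assumes profiling one additional iteration of fit is representative):
In [ ]:
# Hypothetical: line-by-line profile of Gauss_LDA.fit over a single iteration.
%lprun -f g.fit g.fit(1)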
In [58]:
np.loadtxt("/Users/michael/Documents/GaussianLDA/output/iter10topic0Topic Mean.txt")
Out[58]:
In [50]:
np.savetxt("/Users/michael/Documents/GaussianLDA/output/test.txt", g.topic_params[0]["Topic Mean"])
In [ ]:
In [ ]:
mean1 = g.topic_params[0]["Topic Mean"]; mean2 = g.topic_params[1]["Topic Mean"]; mean3 = g.topic_params[2]["Topic Mean"]
wordvecs.most_similar(positive=[mean1])
In [ ]:
wordvecs.most_similar(positive=[mean2])
In [ ]:
wordvecs.most_similar(positive=[mean3])
In [ ]:
newdoc = "fruit apple orange mango taco painting sculpture book art artist picasso"
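newdoc is not used below; as a rough check (not the model's own inference), its mean word vector can be compared against the learned topic means from above:
In [ ]:
# Sketch: which learned topic mean is newdoc's average GloVe vector closest to?
vecs = np.array([wordvecs[w] for w in newdoc.split() if w in wordvecs.vocab])
doc_mean = vecs.mean(axis=0)
print([np.linalg.norm(doc_mean - m) for m in (mean1, mean2)])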
In [ ]:
def myfunc(mat):
    np.linalg.det(mat)      # profile the determinant call as well
    return np.linalg.inv(mat)
%prun myfunc(np.random.random_sample((10, 10)))  # random matrix, so the inverse exists
In [ ]:
import numpy as np
In [3]:
a = np.cov(np.arange(50.0**2, dtype=np.float32).reshape(50, 50))
a = np.eye(50) * 4.0
# scipy.linalg.cholesky(a, lower=True, overwrite_a=True)
x = np.arange(50) / 200.0  # scaled down so the rank-1 downdate below stays positive definite
In [4]:
from numba import generated_jit, vectorize
In [5]:
b = np.arange(1000*1000).reshape(1000,1000)
In [56]:
@jit(cache=True)
def func(val):
    np.sum(val, axis=0)
In [57]:
%%timeit
func(b)
In [58]:
def func_plain(val):
    # plain-Python version for comparison with the jitted func above
    np.sum(val, axis=0)
%timeit func_plain(b)
In [ ]:
@jit((float64[:, :], float64[:]))
def chol_downdate(L, X):
    """
    Cholesky Rank 1 Downdate
    :param L: Lower-triangular matrix from a Cholesky decomposition
    :param X: Word vector with the same dimensionality as L's columns
    :return: updated lower-triangular matrix
    """
    # assert L.shape[1] == X.shape[0]
    # choldowndate(L.T, X)
    L_c = np.copy(L)  # work on a copy so the original L survives an unstable downdate
    for k in range(X.shape[0]):
        r = np.sqrt(L_c[k, k]**2 - X[k]**2)
        c = r / L_c[k, k]
        s = X[k] / L_c[k, k]
        L_c[k, k] = r
        for i in range(k+1, X.shape[0]):
            L_c[i, k] = (L_c[i, k] - (s * X[i])) / c
            X[i] = (c * X[i]) - (s * L_c[i, k])
        if np.isnan(r) or np.isinf(r):
            print("Warning: downdate produced NaN/inf; returning original L")
            return L  # good reason for making the copy: fall back if the downdate becomes unstable
    return L_c
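A quick sanity check on the downdate (a sketch, reusing a and x from the cell above): the returned factor L' should satisfy L' L'ᵀ = L Lᵀ − x xᵀ as long as the downdated matrix stays positive definite.
In [ ]:
# a (= 4*I) is already lower triangular, so it can stand in for a Cholesky factor.
# Pass a copy of x because chol_downdate mutates X in place.
L_down = chol_downdate(a, x.copy())
print(np.allclose(L_down.dot(L_down.T), a.dot(a.T) - np.outer(x, x)))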
In [ ]:
%%timeit
chol_downdate(a, x.copy())  # pass a copy: the downdate mutates X in place
In [ ]:
@jit
def chol_downdate(L, X):
    """
    Cholesky Rank 1 Downdate
    :param L: Lower-triangular matrix from a Cholesky decomposition
    :param X: Word vector with the same dimensionality as L's columns
    :return: updated lower-triangular matrix
    """
    assert L.shape[1] == X.shape[0]
    # choldowndate(L.T, X)
    L_c = np.copy(L)  # work on a copy so the original L survives an unstable downdate
    for k in range(X.shape[0]):
        r = np.sqrt(np.power(L_c[k, k], 2) - np.power(X[k], 2))
        c = r / L_c[k, k]
        s = X[k] / L_c[k, k]
        L_c[k, k] = r
        for i in range(k+1, X.shape[0]):
            L_c[i, k] = (L_c[i, k] - (s * X[i])) / c
            X[i] = (c * X[i]) - (s * L_c[i, k])
        if np.isnan(r) or np.isinf(r):
            print("Warning: downdate produced NaN/inf; returning original L")
            return L  # good reason for making the copy: fall back if the downdate becomes unstable
    return L_c
In [ ]:
%%timeit
chol_downdate(a, x.copy())  # pass a copy: the downdate mutates X in place
In [9]:
np.sum(b, axis=0)
Out[9]:
In [14]:
np.einsum('ik->k', b)
Out[14]:
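Quick check (assuming the goal here is to compare summation methods) that the einsum reduction matches np.sum along axis 0:
In [ ]:
np.allclose(np.sum(b, axis=0), np.einsum('ik->k', b))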
In [ ]:
In [115]:
w = "word"
len(w)
Out[115]:
In [ ]:
cd ../positive_text
In [3]:
stopwords = set(nltk.corpus.stopwords.words(fileids='english'))
In [ ]:
In [4]:
directory = "/Users/michael/Documents/positive_text/"
docs = []
vocab = set(wordvecs.vocab.keys())
for path in os.listdir(directory)[1:]:  # skip the first entry (e.g. .DS_Store)
    for subpath in os.listdir(directory + path):
        try:
            with open(directory + path + '/' + subpath, 'r') as f:
                txt = f.read()
                txt = txt.split()
                txt = [word for word in txt if word in vocab
                       and word.isalpha()
                       and word not in stopwords
                       and word != "ve"
                       and len(word) > 1]
                txt = ' '.join(txt)
                if len(txt) > 5: docs.append(txt)
        except IOError:  # subpath is a directory, not a file: descend one more level
            for subsubpath in os.listdir(directory + path + '/' + subpath):
                try:
                    with open(directory + path + '/' + subpath + '/' + subsubpath, 'r') as f:
                        txt = f.read()
                        txt = txt.split()
                        txt = [word for word in txt if word in vocab
                               and word.isalpha()
                               and word not in stopwords
                               and word != "ve"
                               and len(word) > 1]
                        txt = ' '.join(txt)
                        docs.append(txt)
                except IOError:
                    continue
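The nested try/except walk above could also be written with os.walk, which handles the recursion; a sketch under the same filtering assumptions (it keeps any document with more than one surviving word, which differs slightly from the length checks above):
In [ ]:
# Alternative traversal with os.walk (sketch; same word filtering as above).
docs_walk = []
for root, dirnames, filenames in os.walk(directory):
    for name in filenames:
        try:
            with open(os.path.join(root, name), 'r') as f:
                words = f.read().split()
        except IOError:
            continue
        words = [w for w in words if w in vocab
                 and w.isalpha()
                 and w not in stopwords
                 and w != "ve"
                 and len(w) > 1]
        if len(words) > 1:
            docs_walk.append(' '.join(words))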
In [ ]:
import numpy as np
In [ ]:
!touch cleandocs.txt
In [ ]:
pwd
In [6]:
from operator import itemgetter
In [9]:
subsample = itemgetter(*np.random.random_integers(0, 1650, 500).tolist())
In [ ]:
In [11]:
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
# np.savetxt(f, docs)
with open(f, 'w') as fi:
    for doc in subsample(docs):
        fi.write("%s\n" % doc)
In [30]:
f = '/Users/michael/Documents/GaussianLDA/cleandocs.txt'
with open(f, 'r') as fi:
    text = fi.read().splitlines()
In [ ]:
cd Insurance/
Proof that my method of doing np.sum on an array of lists was slower than alternative methods (a fully vectorized baseline follows the two timing cells below).
In [98]:
b = np.random.random_sample(size=(10000, 50))
In [99]:
%%timeit
c = np.zeros(50)
for line in b:
    c += line
In [100]:
%%timeit
arrs = []
for line in b:
    arrs.append(line)
d = np.array(arrs)
np.sum(d, axis=0)
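For reference, the fully vectorized sum over the same array (no Python-level loop) can be timed the same way; on most machines it should be the fastest of the three:
In [ ]:
%%timeit
np.sum(b, axis=0)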
In [105]:
a = {"a":1, "b": 2}
In [107]:
[k for k, v in a.iteritems() if v == 1]
Out[107]:
In [109]:
b = set([1, 2, 3])
c = set([2, 3,4,5])
In [114]:
d = set(["children", "contact", "holla"])
In [113]:
(docs[1], docs[2])
Out[113]:
In [2]:
from sklearn.cluster import KMeans
In [3]:
km = KMeans(n_clusters=15, n_jobs=-1)
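One plausible use of KMeans here (a sketch, not necessarily what was run) is to cluster the GloVe vectors of the words in the cleaned documents, e.g. as a comparison point or initialization for the Gaussian LDA topics:
In [ ]:
# Sketch: cluster word vectors for the vocabulary of the cleaned docs.
# Assumes `text` (from cleandocs.txt) and `wordvecs` are still in memory.
doc_vocab = sorted(set(w for doc in text for w in doc.split() if w in wordvecs.vocab))
X = np.array([wordvecs[w] for w in doc_vocab])
labels = km.fit_predict(X)
[w for w, lab in zip(doc_vocab, labels) if lab == 0][:20]  # words in cluster 0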
In [ ]: