In [1]:
import numpy as np
from matplotlib import pyplot as plt
from scipy import linalg
In [2]:
%matplotlib inline
np.set_printoptions(suppress=True)
In [40]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import get_data_home
import os
from sklearn.datasets import load_files
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
In [4]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
Because the AWS download of this dataset is very slow, I have manually downloaded the dataset and load it into the notebook below.
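For reference, here is a minimal sketch of that manual step: fetch the 20news-bydate archive and unpack it under scikit-learn's data home so the loader below can find it. The mirror URL is an assumption; substitute whichever copy you downloaded.
import os, tarfile, urllib.request
from sklearn.datasets import get_data_home
archive_url = 'http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz'  # assumed mirror
twenty_home = os.path.join(get_data_home(), '20news_home')
os.makedirs(twenty_home, exist_ok=True)
archive_path = os.path.join(twenty_home, '20news-bydate.tar.gz')
if not os.path.exists(archive_path):
    urllib.request.urlretrieve(archive_url, archive_path)
with tarfile.open(archive_path, 'r:gz') as tar:
    tar.extractall(twenty_home)  # creates 20news-bydate-train/ and 20news-bydate-test/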
In [36]:
### The code below is adapted from scikit-learn's implementation of 20 newsgroups preprocessing
import re
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                       r'|^In article|^Quoted from|^\||^>)')

def strip_newsgroup_quoting(text):
    """
    Given text in "news" format, strip lines beginning with the quote
    characters > or |, plus lines that often introduce a quoted section
    (for example, because they contain the string 'writes:').
    """
    good_lines = [line for line in text.split('\n')
                  if not _QUOTE_RE.search(line)]
    return '\n'.join(good_lines)
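A quick check of what the quote stripper drops, on a hypothetical two-line quote:
# Hypothetical message: the 'writes:' attribution and the '>' quoted line are removed
sample = 'Alice writes:\n> the moon is made of cheese\nThat claim needs a citation.'
print(strip_newsgroup_quoting(sample))
# -> That claim needs a citation.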
In [6]:
def strip_newsgroup_footer(text):
    """
    Given text in "news" format, attempt to remove a signature block.
    As a rough heuristic, we assume that signatures are set apart by either
    a blank line or a line made of hyphens, and that it is the last such line
    in the file (disregarding blank lines at the end).
    """
    lines = text.strip().split('\n')
    for line_num in range(len(lines) - 1, -1, -1):
        line = lines[line_num]
        if line.strip().strip('-') == '':
            break
    if line_num > 0:
        return '\n'.join(lines[:line_num])
    else:
        return text
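Similarly, a hypothetical message with a signature block shows the footer heuristic at work:
# Hypothetical message: everything after the last hyphen-only line is treated as a signature
msg = 'Space tethers look promising.\n--\nBob\nNASA'
print(strip_newsgroup_footer(msg))
# -> Space tethers look promising.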
In [7]:
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"
def strip_newsgroup_header(text):
    """
    Given text in "news" format, strip the headers, by removing everything
    before the first blank line.
    """
    _before, _blankline, after = text.partition('\n\n')
    return after

def preprocess_fetch_data(categories, remove, subset='train', data_home=None):
    data_home = get_data_home(data_home=data_home)
    twenty_home = os.path.join(data_home, "20news_home")
    train_path = os.path.join(twenty_home, TRAIN_FOLDER)
    test_path = os.path.join(twenty_home, TEST_FOLDER)
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)
        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)
    data.description = 'the 20 newsgroups by date dataset'
    if 'headers' in remove:
        data.data = [strip_newsgroup_header(text) for text in data.data]
    if 'footers' in remove:
        data.data = [strip_newsgroup_footer(text) for text in data.data]
    if 'quotes' in remove:
        data.data = [strip_newsgroup_quoting(text) for text in data.data]
    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()
    return data
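When the direct download works, scikit-learn's built-in loader gives the same result; the helper above only mirrors its post-processing steps:
# Equivalent call using the standard loader (will download the data if it is not cached)
# newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)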
In [8]:
newsgroups_train = preprocess_fetch_data(categories,remove,subset='train')
In [9]:
newsgroups_test = preprocess_fetch_data(categories,remove,subset='test')
In [10]:
newsgroups_train.filenames.shape,newsgroups_train.target.shape
Out[10]:
In [14]:
#print("\n".join(newsgroups_train.data[:3]))
In [11]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[:3]]
Out[11]:
In [12]:
newsgroups_train.target[:10]
Out[12]:
In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
In [14]:
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(newsgroups_train.data).todense()
vectors.shape
Out[14]:
In [15]:
print(len(newsgroups_train.data),vectors.shape)
In [16]:
type(newsgroups_train)
Out[16]:
In [17]:
vocab = np.array(vectorizer.get_feature_names())
In [18]:
U, s, Vh = linalg.svd(vectors,full_matrices=False)
In [21]:
print(U.shape, s.shape, Vh.shape,vectors.shape)
In [23]:
# Exercise: confirm that U, s, Vh is a decomposition of the variable `vectors`
m, n = vectors.shape
D = np.diag(s)
U.shape, D.shape, Vh.shape
np.allclose(vectors, np.dot(U, np.dot(D, Vh)))
Out[23]:
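The full SVD reconstructs `vectors` exactly; for topic modelling we only keep the leading singular vectors. A small sketch of the rank-k approximation and its residual (k=10 is an arbitrary choice here):
k = 10  # arbitrary truncation rank, for illustration only
approx = U[:, :k] @ np.diag(s[:k]) @ Vh[:k]
np.linalg.norm(vectors - approx)  # Frobenius error of the rank-k approximation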
In [24]:
plt.plot(s);
In [25]:
plt.plot(s[:10])
Out[25]:
In [26]:
num_top_words=8

def show_topics(a):
    # argsort()[:-num_top_words-1:-1] picks the indices of the num_top_words largest weights, in descending order
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]
In [35]:
show_topics(Vh[:10])
Out[35]:
In [38]:
### scikit-learn implementation of NMF
m, n = vectors.shape
d = 5
clf = decomposition.NMF(n_components=d, random_state=1)
W1 = clf.fit_transform(vectors)
H1 = clf.components_
In [39]:
show_topics(H1)
Out[39]:
In [41]:
tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf = tfidf.fit_transform(newsgroups_train.data)
W1 = clf.fit_transform(vec_tfidf)
H1 = clf.components_
In [46]:
categories
Out[46]:
In [42]:
show_topics(H1)
Out[42]:
In [43]:
plt.plot(H1[0])
Out[43]:
In [44]:
clf.reconstruction_err_
Out[44]:
In [47]:
lam=1e3
lr=1e-2
m, n = vec_tfidf.shape
In [48]:
W1 = clf.fit_transform(vectors)
H1 = clf.components_
In [49]:
show_topics(H1)
Out[49]:
In [50]:
mu = 1e-6
def grads(M, W, H):
    R = W@H - M
    return R@H.T + penalty(W, mu)*lam, W.T@R + penalty(H, mu)*lam  # dW, dH
In [51]:
def penalty(M, mu):
    # elementwise: zero where M >= mu, otherwise the (negative) amount by which M falls below mu
    return np.where(M >= mu, 0, np.minimum(M - mu, 0))
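To see what the penalty contributes, a tiny made-up example: entries below mu pick up a negative gradient term, so the update W -= lr*dW pushes them back up toward the feasible region, while entries at or above mu are untouched.
# Made-up 1x3 matrix: only entries below mu receive a (negative) penalty gradient
penalty(np.array([[-0.5, 0.0, 0.3]]), mu)
# -> approximately [[-0.5, -1e-06, 0.]]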
In [52]:
def upd(M, W, H, lr):
    dW, dH = grads(M, W, H)
    W -= lr*dW; H -= lr*dH
In [53]:
def report(M, W, H):
    print(np.linalg.norm(M-W@H), W.min(), H.min(), (W<0).sum(), (H<0).sum())
In [54]:
W = np.abs(np.random.normal(scale=0.01, size=(m,d)))
H = np.abs(np.random.normal(scale=0.01, size=(d,n)))
In [57]:
report(vec_tfidf, W, H)
In [59]:
upd(vec_tfidf,W,H,lr)
In [60]:
report(vec_tfidf, W, H)
In [65]:
for i in range(50):
    upd(vec_tfidf, W, H, lr)
    if i % 10 == 0: report(vec_tfidf, W, H)
In [66]:
show_topics(H)
Out[66]:
In [67]:
import torch
import torch.cuda as tc
from torch.autograd import Variable
In [68]:
def V(M):
    return Variable(M, requires_grad=True)
In [69]:
v = vec_tfidf.todense()
In [70]:
t_vec = torch.Tensor(v.astype(np.float32)).cuda()
In [71]:
mu = 1e-5
In [72]:
def grads_t(M, W, H):
    R = W.mm(H)-M
    return (R.mm(H.t()) + penalty_t(W, mu)*lam,
            W.t().mm(R) + penalty_t(H, mu)*lam)  # dW, dH

def penalty_t(M, mu):
    return (M<mu).type(tc.FloatTensor)*torch.clamp(M - mu, max=0.)

def upd_t(M, W, H, lr):
    dW, dH = grads_t(M, W, H)
    W.sub_(lr*dW); H.sub_(lr*dH)

def report_t(M, W, H):
    print((M-W.mm(H)).norm(2), W.min(), H.min(), (W<0).sum(), (H<0).sum())
In [73]:
t_W = tc.FloatTensor(m,d)
t_H = tc.FloatTensor(d,n)
t_W.normal_(std=0.01).abs_();
t_H.normal_(std=0.01).abs_();
In [74]:
d=6; lam=100; lr=0.05
In [76]:
for i in range(1000):
    upd_t(t_vec, t_W, t_H, lr)
    if i % 100 == 0:
        report_t(t_vec, t_W, t_H)
        lr *= 0.9
In [77]:
show_topics(t_H.cpu().numpy())
Out[77]:
In [78]:
plt.plot(t_H.cpu().numpy()[0])
Out[78]:
In [79]:
t_W.mm(t_H).max()
Out[79]:
In [80]:
t_vec.max()
Out[80]:
In [81]:
x = Variable(torch.ones(2, 2), requires_grad=True)
print(x)
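The Variable above is the entry point for doing the same thing with autograd instead of hand-derived gradients. As a toy continuation (not part of the factorisation itself), differentiating a scalar function of x fills in x.grad:
# Toy autograd example: d/dx sum(x**2) = 2*x
y = (x * x).sum()
y.backward()
print(x.grad)  # a 2x2 tensor of 2s, since x is all ones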