In [1]:
import numpy as np
from matplotlib import pyplot as plt
from scipy import linalg

In [2]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [40]:
from  sklearn.datasets import fetch_20newsgroups 
from sklearn.datasets import get_data_home
import os
from sklearn.datasets import load_files
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')

Load data from existing files

due to the fact that aws download is very slow for the dataset, i have manually downloaded the dataset and loading into the notebook.


In [36]:
### the below code is from sklearn implementation of preprocessing the news group data
import re

_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                       r'|^In article|^Quoted from|^\||^>)')
def strip_newsgroup_quoting(text):
    """
    Given text in "news" format, strip lines beginning with the quote
    characters > or |, plus lines that often introduce a quoted section
    (for example, because they contain the string 'writes:'.)
    """
    good_lines = [line for line in text.split('\n')
                  if not _QUOTE_RE.search(line)]
    return '\n'.join(good_lines)

In [6]:
def strip_newsgroup_footer(text):
    """
    Given text in "news" format, attempt to remove a signature block.

    As a rough heuristic, we assume that signatures are set apart by either
    a blank line or a line made of hyphens, and that it is the last such line
    in the file (disregarding blank lines at the end).
    """
    lines = text.strip().split('\n')
    for line_num in range(len(lines) - 1, -1, -1):
        line = lines[line_num]
        if line.strip().strip('-') == '':
            break

    if line_num > 0:
        return '\n'.join(lines[:line_num])
    else:
        return text

In [7]:
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"
def strip_newsgroup_header(text):
    """
    Given text in "news" format, strip the headers, by removing everything
    before the first blank line.
    """
    _before, _blankline, after = text.partition('\n\n')
    return after
    
def preprocess_fetch_data(categories,remove,subset='train',data_home = None):
    data_home = get_data_home(data_home=data_home)
    twenty_home = os.path.join(data_home, "20news_home")
    train_path = os.path.join(twenty_home, TRAIN_FOLDER)
    test_path = os.path.join(twenty_home, TEST_FOLDER)
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)

        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'the 20 newsgroups by date dataset'

    if 'headers' in remove:
        data.data = [strip_newsgroup_header(text) for text in data.data]
    if 'footers' in remove:
        data.data = [strip_newsgroup_footer(text) for text in data.data]
    if 'quotes' in remove:
        data.data = [strip_newsgroup_quoting(text) for text in data.data]

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    return data

In [8]:
newsgroups_train = preprocess_fetch_data(categories,remove,subset='train')

In [9]:
newsgroups_test = preprocess_fetch_data(categories,remove,subset='test')

In [10]:
newsgroups_train.filenames.shape,newsgroups_train.target.shape


Out[10]:
((2034,), (2034,))

In [14]:
#print("\n".join(newsgroups_train.data[:3]))

In [11]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[:3]]


Out[11]:
array(['alt.atheism', 'sci.space', 'comp.graphics'],
      dtype='<U18')

In [12]:
newsgroups_train.target[:10]


Out[12]:
array([0, 2, 1, 0, 2, 3, 0, 0, 2, 2])

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [14]:
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(newsgroups_train.data).todense()
vectors.shape


Out[14]:
(2034, 26576)

In [15]:
print(len(newsgroups_train.data),vectors.shape)


2034 (2034, 26576)

In [16]:
type(newsgroups_train)


Out[16]:
sklearn.utils.Bunch

In [17]:
vocab = np.array(vectorizer.get_feature_names())

SVD(Singular Value Decomposition)


In [18]:
U, s, Vh = linalg.svd(vectors,full_matrices=False)

In [21]:
print(U.shape, s.shape, Vh.shape,vectors.shape)


(2034, 2034) (2034,) (2034, 26576) (2034, 26576)

In [23]:
#Exercise: confrim that U, s, Vh is a decomposition of the var Vectors
m,n = vectors.shape
D = np.diag(s)
U.shape,D.shape,Vh.shape
np.allclose(vectors,np.dot(U,np.dot(D,Vh)))


Out[23]:
True

In [24]:
plt.plot(s);



In [25]:
plt.plot(s[:10])


Out[25]:
[<matplotlib.lines.Line2D at 0x7feeb2a794e0>]

In [26]:
num_top_words=8

def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [35]:
show_topics(Vh[:10])


Out[35]:
['critus ditto propagandist surname galacticentric kindergarten surreal imaginative',
 'edu graphics data space pub mail 128 3d',
 'space jesus launch god people satellite matthew atheists',
 'space launch satellite commercial nasa satellites market year',
 'jpeg graphics space pub edu ray mail send',
 'jesus matthew prophecy messiah psalm isaiah david said',
 'launch commercial satellite market image services satellites launches',
 'image probe surface lunar mars probes moon orbit',
 'argument fallacy conclusion example true ad argumentum premises',
 'probe data surface moon mars probes lunar launch']

NMF for topic modelling


In [38]:
### scikit learn Implemntation of NMF
m,n = vectors.shape
d = 5
clf = decomposition.NMF(n_components=d,random_state=1)
W1 = clf.fit_transform(vectors)
H1 = clf.components_

In [39]:
show_topics(H1)


Out[39]:
['jpeg image gif file color images format quality',
 'edu graphics pub mail 128 ray ftp send',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists does atheism said',
 'image data available software processing ftp edu analysis']

TF-IDF for topic modelling


In [41]:
tfidf = TfidfVectorizer(stop_words='english')
vec_tfidf = tfidf.fit_transform(newsgroups_train.data)
W1 = clf.fit_transform(vec_tfidf)
H1 = clf.components_

In [46]:
categories


Out[46]:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

In [42]:
show_topics(H1)


Out[42]:
['people don think just like objective say morality',
 'graphics thanks files image file program windows know',
 'space nasa launch shuttle orbit moon lunar earth',
 'ico bobbe tek beauchaine bronx manhattan sank queens',
 'god jesus bible believe christian atheism does belief']

In [43]:
plt.plot(H1[0])


Out[43]:
[<matplotlib.lines.Line2D at 0x7feec9be61d0>]

In [44]:
clf.reconstruction_err_


Out[44]:
43.712926058020408

NMF from scratch in numpy using SGD


In [47]:
lam=1e3
lr=1e-2
m, n = vec_tfidf.shape

In [48]:
W1 = clf.fit_transform(vectors)
H1 = clf.components_

In [49]:
show_topics(H1)


Out[49]:
['jpeg image gif file color images format quality',
 'edu graphics pub mail 128 ray ftp send',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists does atheism said',
 'image data available software processing ftp edu analysis']

In [50]:
mu = 1e-6
def grads(M, W, H):
    R = W@H-M
    return R@H.T + penalty(W, mu)*lam, W.T@R + penalty(H, mu)*lam # dW, dH

In [51]:
def penalty(M, mu):
    return np.where(M>=mu,0, np.min(M - mu, 0))

In [52]:
def upd(M, W, H, lr):
    dW,dH = grads(M,W,H)
    W -= lr*dW; H -= lr*dH

In [53]:
def report(M,W,H): 
    print(np.linalg.norm(M-W@H), W.min(), H.min(), (W<0).sum(), (H<0).sum())

In [54]:
W = np.abs(np.random.normal(scale=0.01, size=(m,d)))
H = np.abs(np.random.normal(scale=0.01, size=(d,n)))

In [57]:
report(vec_tfidf, W, H)


44.4259374728 1.27068916306e-07 4.76794046496e-08 0 0

In [59]:
upd(vec_tfidf,W,H,lr)

In [60]:
report(vec_tfidf, W, H)


44.4180500987 -0.000721608595467 -7.22365555113e-05 161 278

In [65]:
for i in range(50): 
    upd(vec_tfidf,W,H,lr)
    if i % 10 == 0: report(vec_tfidf,W,H)


44.2428382591 -9.60396323815e-05 -0.000128273147195 38 3081
44.2117699642 -8.93682416765e-05 -0.000165754324551 31 3925
44.1863266994 -0.000126113870878 -0.000157219085405 47 4644
44.1662683889 -0.000179765234281 -0.000143326736203 67 5588
44.1502581862 -0.000194625849988 -0.000136445603008 67 6317

In [66]:
show_topics(H)


Out[66]:
['people space like just don god think know',
 'god don people just think does like jesus',
 'god just space don people think know like',
 'god people don think know space just like',
 'space know people like think just don god']

PyTorch to create NMF


In [67]:
import torch
import torch.cuda as tc
from torch.autograd import Variable

In [68]:
def V(M):
    return Variable(M,requires_grad = True)

In [69]:
v = vec_tfidf.todense()

In [70]:
t_vec = torch.Tensor(v.astype(np.float32)).cuda()

In [71]:
mu = 1e-5

In [72]:
def grads_t(M, W, H):
    R = W.mm(H)-M
    return (R.mm(H.t()) + penalty_t(W, mu)*lam, 
        W.t().mm(R) + penalty_t(H, mu)*lam) # dW, dH

def penalty_t(M, mu):
    return (M<mu).type(tc.FloatTensor)*torch.clamp(M - mu, max=0.)

def upd_t(M, W, H, lr):
    dW,dH = grads_t(M,W,H)
    W.sub_(lr*dW); H.sub_(lr*dH)

def report_t(M,W,H): 
    print((M-W.mm(H)).norm(2), W.min(), H.min(), (W<0).sum(), (H<0).sum())

In [73]:
t_W = tc.FloatTensor(m,d)
t_H = tc.FloatTensor(d,n)
t_W.normal_(std=0.01).abs_(); 
t_H.normal_(std=0.01).abs_();

In [74]:
d=6; lam=100; lr=0.05

In [76]:
for i in range(1000): 
    upd_t(t_vec,t_W,t_H,lr)
    if i % 100 == 0: 
        report_t(t_vec,t_W,t_H)
        lr *= 0.9


44.3938102722168 -0.004070318304002285 -0.00039740739157423377 768 1357
43.76396560668945 -0.009402437135577202 -0.01516595296561718 1540 17605
43.73957443237305 -0.004607424605637789 -0.007575209718197584 1761 18589
43.737876892089844 -0.005591616965830326 -0.00937887467443943 1768 19057
43.737552642822266 -0.005662467330694199 -0.009697899222373962 1903 19531
43.73701095581055 -0.003180568339303136 -0.0032958541996777058 2386 21032
43.73679733276367 -0.0030161826871335506 -0.005725307390093803 2454 21942
43.73655319213867 -0.0036901137791574 -0.005659892689436674 2359 22763
43.73644256591797 -0.003632614854723215 -0.005929608829319477 2297 22852
43.73622131347656 -0.003584711579605937 -0.005387790035456419 2609 29031

In [77]:
show_topics(t_H.cpu().numpy())


Out[77]:
['objective morality values moral subjective science absolute claim',
 'space nasa launch shuttle orbit lunar moon earth',
 'god jesus bible believe atheism belief christian does',
 'thanks graphics files image file program windows know',
 'don people just think like know say religion']

In [78]:
plt.plot(t_H.cpu().numpy()[0])


Out[78]:
[<matplotlib.lines.Line2D at 0x7feeb34f6828>]

In [79]:
t_W.mm(t_H).max()


Out[79]:
0.40214815735816956

In [80]:
t_vec.max()


Out[80]:
1.0

PyTorch AutoGrad.


In [81]:
x = Variable(torch.ones(2, 2), requires_grad=True)
print(x)


Variable containing:
 1  1
 1  1
[torch.FloatTensor of size 2x2]