In [1]:
import os.path

from collections import Counter
from random import sample

import pandas as pd
import numpy as np

from lda import LDA, datasets as lda_datasets

from newsbreaker.data import load_entries, save_entries

In [2]:
import os.path
from sklearn.externals import joblib

# Reuse a previously saved model and its vocabulary when both can be read
# from disk; otherwise fall back to the vocabulary returned by
# lda_datasets.load_reuters_vocab() and report that no model was loaded.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
try:
    topic_model = joblib.load(os.path.join('topic_model', 'topic_model.pkl'))

    with open(os.path.join('topic_model', 'vocab.txt')) as f:
        vocab = f.read().split('\n')
except Exception:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the notebook.
    vocab = list(lda_datasets.load_reuters_vocab())
    print('Model not loaded')


Model not loaded

In [3]:
# Relative path of the data directory that is passed to load_entries().
folder = os.path.join('..', 'data')

In [4]:
# Load the entries found under `folder` with the project's
# newsbreaker.data.load_entries helper.
entries = load_entries(folder)

In [5]:
def build_features(entry):
    """Return the word-count vector of ``entry`` over the global ``vocab``.

    ``entry.doc(False, False, False)`` is iterated and every item's
    ``lower_`` attribute is counted.
    NOTE(review): the meaning of the three ``False`` flags is defined by the
    entry class, which is not visible here -- confirm against newsbreaker.

    Returns a 1-D numpy array with one count per word of ``vocab``, in
    ``vocab`` order; words that do not occur in the document count as 0.
    """
    tokens = entry.doc(False, False, False)

    occurrences = Counter()
    for token in tokens:
        occurrences[token.lower_] += 1

    # Counter lookups yield 0 for words that were never seen.
    return np.array(list(map(occurrences.__getitem__, vocab)))

In [6]:
# One build_features() count vector per loaded entry, in the order of `entries`.
X = list(map(build_features, entries))

In [7]:
# Keep only the documents that contain at least one vocabulary word,
# i.e. the rows whose counts do not sum to zero.
X = np.array(list(filter(lambda row: row.sum(), X)))

In [8]:
# The column indexes computed below are only valid while X still has one
# column per vocab word. Re-running this cell after vocab was trimmed but
# before X was would silently remove the wrong words, so refuse to do that.
if X.ndim == 2 and X.shape[1] != len(vocab):
    raise RuntimeError(
        'vocab and X are out of sync; rebuild X from the current vocab first'
    )

# Get indexes of words in vocab that don't show up in any training document
drop_vocab_indexs = {
    i
    for i, col in enumerate(X.T)
    if not col.sum()
}

# Drop discarded words from vocab in a single pass. The list is updated in
# place (slice assignment), so build_features keeps reading the same object
# and from now on only counts the words that were kept.
vocab[:] = [
    word
    for i, word in enumerate(vocab)
    if i not in drop_vocab_indexs
]

In [9]:
# Rebuild X from the columns that are kept, i.e. every column whose index is
# not listed in drop_vocab_indexs (the listed ones are the all-zero columns).
X = np.array([
    X.T[n]
    for n in range(len(X.T))
    if n not in drop_vocab_indexs
]).T
# The index set refers to the untrimmed matrix only, so discard it.
del drop_vocab_indexs

In [10]:
# Create and fit the 20-topic model on the filtered document-term matrix X.
# random_state pins the seed used while fitting, so a fresh
# Restart & Run All reproduces the same topics instead of new ones each run.
topic_model = LDA(n_topics=20, random_state=0)
topic_model.fit(X)


Out[10]:
<lda.lda.LDA at 0x167c077f0>

In [11]:
# Draw 100 distinct entries at random and turn each into a count vector;
# test_vectors[i] is the vector of test_entries[i].
test_entries = sample(entries, 100)
test_vectors = np.array(list(map(build_features, test_entries)))

In [12]:
# Apply the fitted model to the sampled vectors.
# NOTE(review): presumably yields one row of per-topic weights per input
# vector (each row is reduced with argmax() further down) -- confirm
# against the lda documentation.
test_Y = topic_model.transform(test_vectors)


WARNING:lda:all zero column in document-term matrix found

In [13]:
# One row per sampled entry: the index of its highest-scoring topic plus the
# fields that identify the entry.
df = pd.DataFrame(
    [
        [weights.argmax(), entry.feedname, entry.index, entry.title]
        for entry, weights in zip(test_entries, test_Y)
    ],
    columns=['topic', 'feedname', 'index', 'title']
)

In [14]:
# Display the sampled entries ordered by assigned topic; sort_values returns
# a sorted copy, so df itself keeps its original row order.
df.sort_values(by='topic')


Out[14]:
topic feedname index title
86 0 ChicagoTribune 2625 Serbia's PM condemns 'brutal treatment' of mig...
69 0 NYTimes 1956 ISIS Damages Temple of Baal in Palmyra
59 0 StLouisPost 13451 Obama, Indonesian leader expected to discuss I...
27 0 WallStreetJournal 647 Stymied at Channel, Migrants With Money Turn t...
6 0 USAToday 6968 Canadian PM Justin Trudeau says cabinet is hal...
14 1 ChicagoTribune 4496 Walgreens nears deal to buy Rite Aid: report
13 1 LATimes 15042 Six Flags Entertainment reports attendance jum...
32 1 StLouisPost 1053 The rogues gallery of accounting scandals thro...
20 1 NYTimes 287 Apple Waits as App Developers Study Who’s Buyi...
53 1 DailyNews 4608 Surgeons make the most money: survey
36 1 NYTimes 93 Amazon Dreams Up a Sale in a Calendar Bursting...
62 2 NYPost 7474 6 performers who could complete the EGOT
44 2 LATimes 2367 'Happy Birthday' song could soon be free
1 2 LATimes 12791 Steven Spielberg's 'Bridge of Spies': Can it c...
0 2 LATimes 10449 Emmy parties: Oceans of gold — and Amy Schumer...
73 3 ChicagoTribune 4399 Father of Oklahoma parade crash suspect: 'This...
30 3 DailyNews 2807 Youth basketball team attacks homeless with br...
66 3 BostonHerald 2533 New Bedford man dead, second man facing charges
67 3 ChicagoTribune 2750 2 shot following dispute in South Side restaurant
72 3 NYPost 12391 The murder of Officer Randolph Holder was an a...
47 3 DailyNews 3203 Six children, two adults found shot in head at...
95 3 StLouisPost 6415 Cousins indicted for shooting up empty histori...
93 3 WashingtonPost 1669 November trial set for former University of Ci...
81 3 LATimes 10341 Beverly Hills probes Ferrari race, but Qatari ...
7 3 NYPost 13478 Guy cited for manspreading on subway was wante...
71 4 LATimes 17532 USC Now morning report: Trojans get physical
55 4 NYPost 13696 It might take a few weeks for Carmelo Anthony ...
68 4 NYTimes 1262 Jets Coach Todd Bowles on Fight
38 4 LATimes 9869 College football viewer's guide: Game times, c...
29 4 ChicagoTribune 2404 Blackhawks trade Kris Versteeg, Joakim Nordstr...
... ... ... ... ...
2 12 LATimes 15657 Hacienda-style house blends old and new
61 12 NYPost 11698 Thieves break through walls in ‘Ocean’s 11’-st...
77 12 LATimes 12938 Under a veil of darkness, Oklahoma removed the...
39 13 DailyNews 8197 Maine mayor wants to list welfare recipients o...
85 13 NYPost 11905 Ex-journalist repays mag for phony article
25 13 DailyNews 5188 Nestle makes anti-forced labor vow amid allega...
94 13 NYTimes 4128 Amazon Spars With The Times Over Investigative...
3 15 ChicagoTribune 3619 Live blog: Cubs at Pirates and Blackhawks vs. ...
19 15 LATimes 5303 Dodgers' Kenley Jansen goes from son's birth t...
76 15 NYPost 1693 AL ‘contenders’ are fakes if they can’t reach ...
23 15 BostonHerald 9695 Aces wild: Playoffs open with Astros-Yankees, ...
40 15 BostonHerald 4744 Royals&#039; 9th-inning rally buries Red Sox
96 16 NYPost 9961 Suspect in UVA killing gets 3 life terms for 2...
51 16 ChicagoTribune 1356 Northbrook man pleads guilty in large-scale 's...
28 16 StLouisPost 14388 Trial set to start for indicted ex-NY Assembly...
91 16 DailyNews 12034 Sheldon Silver was never offered plea deal: pr...
5 16 LATimes 7598 L.A.-based trial attorney accused of misapprop...
97 17 BostonHerald 7791 Firefighters battle 7-alarm blaze at vacant Ha...
54 17 StLouisPost 12055 NOAA: Thanks to El Nino, the US looks pretty w...
49 17 StLouisPost 7455 Explosive wildfire threatens California mounta...
37 17 NYPost 13679 Death toll expected to rise in Romanian nightc...
10 17 LATimes 14494 L.A. County Sheriff's Department says dogs suf...
18 17 WashingtonPost 2618 Mexico&#8217;s new plan to protect endangered ...
70 18 BostonHerald 12291 Chesney playing Pittsburgh a 2nd time since fa...
4 19 LATimes 10810 A call for compassion as End of Life Option Ac...
57 19 LATimes 5479 L.A. leaders' lofty goals are at odds with rea...
26 19 NYPost 4830 Carl Heastie stands up for common sense on Ind...
52 19 WashingtonPost 4788 Six amazing science stories that are absolutel...
78 19 LATimes 4317 'Soft'? 'Hot'? 'Clean'? We help you decode the...
89 19 BostonHerald 2662 5 key points from Bobby Jindal&#039;s intervie...

100 rows × 4 columns

 Save model


In [15]:
# Create the output directory first so the dump does not fail with
# FileNotFoundError when 'topic_model' does not exist yet, then persist
# the fitted model.
os.makedirs('topic_model', exist_ok=True)
joblib.dump(topic_model, os.path.join('topic_model', 'topic_model.pkl'))


Out[15]:
['model_data/topic_model.pkl',
 'model_data/topic_model.pkl_01.npy',
 'model_data/topic_model.pkl_02.npy',
 'model_data/topic_model.pkl_03.npy',
 'model_data/topic_model.pkl_04.npy',
 'model_data/topic_model.pkl_05.npy',
 'model_data/topic_model.pkl_06.npy',
 'model_data/topic_model.pkl_07.npy']

In [16]:
# Store the vocabulary next to the model, one word per line and without a
# trailing newline, which is the layout the loading cell reads back with
# split('\n').
vocab_path = os.path.join('topic_model', 'vocab.txt')
with open(vocab_path, 'w') as vocab_file:
    vocab_file.write('\n'.join(vocab))