In [1]:
%pylab inline
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
import pickle
In [2]:
# Load the scraped articles and drop any with empty content
articles = json.load(open('articles_html1000.json'))
articles = [article for article in articles if article['content'] != []]
In [3]:
# Flatten each article's list of paragraphs into a single string
article_texts = [' '.join(article['content']) for article in articles]
In [4]:
# Parallel lists of section labels and URLs, one per article
sections = [article['section_name'] for article in articles]
urls = [article['web_url'] for article in articles]
In [5]:
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
In [6]:
article_word = vectorizer.fit_transform(article_texts)
article_word
Out[6]:
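article_word is a sparse (articles × vocabulary) matrix: row i holds the TF-IDF weights of article i's words.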
In [7]:
# Empty (sections x articles) indicator matrix; constructing it from the
# shape tuple avoids allocating a dense zeros array first
label_article = sparse.csr_matrix((len(set(sections)), len(articles)))
label_article
Out[7]:
In [8]:
# Map each section name to a row index
label_lookup = {section: i for i, section in enumerate(set(sections))}
In [9]:
# Map each article URL to a column index
article_lookup = {url: i for i, url in enumerate(urls)}
In [10]:
# Map each vocabulary term to its column index in article_word
# (equivalently, vectorizer.vocabulary_)
word_lookup = {word: i for i, word in enumerate(vectorizer.get_feature_names())}
len(word_lookup)
Out[10]:
In [11]:
# Set entry (section, article) to 1 for every article
for article in articles:
    label_article[label_lookup[article['section_name']], article_lookup[article['web_url']]] = 1
label_article
Out[11]:
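Each article sets exactly one entry, so the indicator matrix should end up with one nonzero per article (assuming the URLs are unique). A quick sanity check, not from the original session:

assert label_article.nnz == len(articles)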
In [12]:
# Multiplying the indicator matrix by the TF-IDF matrix sums each section's
# article rows: cpt[c, w] is the total weight of word w in section c
cpt = label_article * article_word
cpt
Out[12]:
In [13]:
# Replace each nonzero entry with its log conditional probability:
# log(weight of word w in section c / total weight in section c)
rows, cols = cpt.nonzero()
norms = cpt.sum(axis=1)
for i, j in zip(rows, cols):
    cpt[i, j] = log(cpt[i, j] / norms[i, 0])
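Each nonzero entry of cpt is now an estimate of log P(word | section): the word's share of its section's total TF-IDF mass. Entries that were zero stay zero, i.e. log 1, so unseen (section, word) pairs contribute nothing to a dot-product score; there is no smoothing here.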
In [14]:
# Log prior of each section: log(fraction of articles in that section)
priors = zeros(len(label_lookup))
for section in sections:
    priors[label_lookup[section]] += 1
priors = matrix(log(priors / priors.sum())).transpose()
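priors[c] is log P(section c) = log(n_c / N), shaped as a (sections × 1) column so it can be added directly to cpt times a document's TF-IDF column vector.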
In [16]:
# Persist the model components (pickle requires binary mode, 'wb')
pickle.dump(vectorizer, open('vectorizer.pickle', 'wb'))
pickle.dump(cpt, open('cpt.pickle', 'wb'))
pickle.dump(priors, open('priors.pickle', 'wb'))
pickle.dump(label_lookup, open('label_lookup.pickle', 'wb'))
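The pickled pieces are enough to score a new document Naive-Bayes style: add the log priors to cpt times the document's TF-IDF column and take the argmax. A minimal sketch, not from the original session; new_text and inv_label_lookup are illustrative names:

# Hypothetical prediction step: score(c) = log P(c) + sum_w log P(w | c)
vec = pickle.load(open('vectorizer.pickle', 'rb'))
cpt_table = pickle.load(open('cpt.pickle', 'rb'))
log_priors = pickle.load(open('priors.pickle', 'rb'))
labels = pickle.load(open('label_lookup.pickle', 'rb'))
inv_label_lookup = {i: name for name, i in labels.items()}

new_text = "The senators debated the budget proposal on Tuesday."
x = vec.transform([new_text]).transpose()   # (vocabulary x 1) sparse column
scores = cpt_table * x + log_priors         # (sections x 1) log scores
print(inv_label_lookup[int(scores.argmax())])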