In [1]:
%pylab inline
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
import pickle


Populating the interactive namespace from numpy and matplotlib

In [2]:
articles = json.load(open('articles_html1000.json'))
# keep only articles that actually have body text
articles = [article for article in articles if article['content']]
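
Each record is assumed to carry at least three fields, inferred from how the cells below use them (the file itself ships no schema): 'content', a list of text fragments; 'section_name'; and 'web_url'. A quick way to check one record:

    # inspect the fields of a single article record (illustrative only)
    print(sorted(articles[0].keys()))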

In [3]:
# join each article's content fragments into a single document string
article_texts = [' '.join(article['content']) for article in articles]

In [4]:
# parallel lists: section label and URL for each article
sections = [article['section_name'] for article in articles]
urls = [article['web_url'] for article in articles]

In [5]:
# TF-IDF features over the raw texts, dropping English stop words
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
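
Under scikit-learn's defaults (smooth_idf=True, sublinear_tf=False, L2 row normalization), each entry is, before normalization,

$$\mathrm{tfidf}(a, w) = \mathrm{tf}(a, w)\left(1 + \ln\frac{1 + N}{1 + \mathrm{df}(w)}\right)$$

where N is the number of documents, tf(a, w) the count of w in article a, and df(w) the number of documents containing w; each article row is then scaled to unit Euclidean length.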

In [6]:
# rows are articles, columns are vocabulary terms
article_word = vectorizer.fit_transform(article_texts)
article_word


Out[6]:
<985x37487 sparse matrix of type '<type 'numpy.float64'>'
	with 231502 stored elements in Compressed Sparse Row format>
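
As a quick sanity check (a sketch, not part of the original run), the columns of article_word can be mapped back to terms to list the heaviest-weighted words in one article:

    # top ten TF-IDF terms of the first article (illustrative only)
    terms = vectorizer.get_feature_names()
    row = article_word[0].toarray().ravel()
    for idx in row.argsort()[::-1][:10]:
        print('%s %.3f' % (terms[idx], row[idx]))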

In [7]:
# section-by-article indicator matrix: one row per section, one column per article
label_article = sparse.csr_matrix(np.zeros((len(set(sections)),len(articles))))
label_article


Out[7]:
<26x985 sparse matrix of type '<type 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [8]:
# section name -> row index in label_article
label_lookup = dict(zip(set(sections), range(len(set(sections)))))

In [9]:
# article URL -> column index in label_article
article_lookup = dict(zip(urls, range(len(urls))))

In [10]:
# term -> column index in article_word
feature_names = vectorizer.get_feature_names()
word_lookup = dict(zip(feature_names, range(len(feature_names))))
len(word_lookup)


Out[10]:
37487

In [11]:
# put a 1 at (section row, article column) for every article
for article in articles:
    label_article[label_lookup[article['section_name']], article_lookup[article['web_url']]] = 1
label_article


/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:486: SparseEfficiencyWarning: changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  SparseEfficiencyWarning)
Out[11]:
<26x985 sparse matrix of type '<type 'numpy.float64'>'
	with 985 stored elements in Compressed Sparse Row format>
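
Since every article belongs to exactly one section, each column of label_article should now hold exactly one 1, which is easy to verify (a sketch):

    # every article is assigned to exactly one section
    assert (label_article.sum(axis=0) == 1).all()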

In [12]:
# sections x words: total TF-IDF weight each word contributes to each section
cpt = label_article * article_word
cpt


Out[12]:
<26x37487 sparse matrix of type '<type 'numpy.float64'>'
	with 106610 stored elements in Compressed Sparse Row format>
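
This product does the per-section aggregation in one step: multiplying the 26x985 indicator matrix by the 985x37487 TF-IDF matrix sums, for each section, the TF-IDF weights of that section's articles, so every entry acts as a pseudo-count of how strongly a word is tied to a section:

$$\mathrm{cpt}_{c,w} = \sum_{a\,:\,\mathrm{section}(a) = c} \mathrm{tfidf}(a, w)$$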

In [13]:
# turn each row into log conditional probabilities: entry (i, j) becomes
# log(weight / row total); entries never stored stay implicitly zero
ij = zip(cpt.nonzero()[0], cpt.nonzero()[1])
norms = cpt.sum(axis=1)
for i, j in ij:
    cpt[i, j] = log(cpt[i, j] / norms[i][0, 0])
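
Each stored entry now holds a log conditional probability, with TF-IDF mass standing in for word counts:

$$\log P(w \mid c) = \log \frac{\mathrm{cpt}_{c,w}}{\sum_{w'} \mathrm{cpt}_{c,w'}}$$

Entries the sparse matrix never stored stay at zero rather than going to negative infinity, which is the implicit smoothing choice here. The entry-by-entry loop is also slow on large matrices; a vectorized sketch that could replace it (same result up to floating-point rounding):

    # row-normalize with a diagonal scaling matrix, then log the stored entries
    inv = sparse.diags((1.0 / norms).A.ravel(), 0)
    cpt = sparse.csr_matrix(inv * cpt)
    cpt.data = np.log(cpt.data)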

In [14]:
# log prior for each section: the log of its share of all articles
priors = zeros((len(label_lookup)))

for section in sections:
    priors[label_lookup[section]] += 1

priors = matrix(log(priors / priors.sum())).transpose()
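
With n_c articles in section c, this stores the log prior

$$\log P(c) = \log \frac{n_c}{\sum_{c'} n_{c'}}$$

as a 26x1 column, which lines up with the per-section scores computed at prediction time.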

In [16]:
# binary mode ('wb') is the safe way to write pickles
pickle.dump(vectorizer, open('vectorizer.pickle', 'wb'))
pickle.dump(cpt, open('cpt.pickle', 'wb'))
pickle.dump(priors, open('priors.pickle', 'wb'))
pickle.dump(label_lookup, open('label_lookup.pickle', 'wb'))
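
Those four pickles are everything a classifier needs at prediction time. A usage sketch follows; the decision rule is the standard multinomial Naive Bayes score with TF-IDF weights in place of raw counts, and classify is a name introduced here for illustration, not something defined in the notebook:

    # score(c) = log P(c) + sum_w x_w * log P(w | c); pick the best section
    inv_label_lookup = dict((i, name) for name, i in label_lookup.items())

    def classify(text):
        x = vectorizer.transform([text])         # 1 x n_terms TF-IDF row
        scores = priors + cpt * x.transpose()    # 26 x 1 unnormalized log-posteriors
        return inv_label_lookup[int(scores.argmax())]

    classify(article_texts[0])

Because words a section never saw were simply never stored in cpt, they add 0 to that section's score instead of negative infinity, so unseen words are effectively ignored rather than ruled out.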