In [1]:
%pylab inline
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
import pickle


Populating the interactive namespace from numpy and matplotlib

In [2]:
articles = json.load(open('articles_html1000.json'))
# keep only articles that actually have body text
articles = [article for article in articles if article['content']]
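
Each record is assumed to carry at least three fields, inferred from how the cells below use them (the file itself ships no schema): 'content', a list of text fragments; 'section_name'; and 'web_url'. A quick way to check one record:

    # inspect the fields of a single article record (illustrative only)
    print(sorted(articles[0].keys()))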

In [3]:
# join each article's content fragments into a single document string
article_texts = [' '.join(article['content']) for article in articles]

In [4]:
# parallel lists: section label and URL for each article
sections = [article['section_name'] for article in articles]
urls = [article['web_url'] for article in articles]

In [5]:
# TF-IDF features over the raw texts, dropping English stop words
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
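
Under scikit-learn's defaults (smooth_idf=True, sublinear_tf=False, L2 row normalization), each entry is, before normalization,

$$\mathrm{tfidf}(a, w) = \mathrm{tf}(a, w)\left(1 + \ln\frac{1 + N}{1 + \mathrm{df}(w)}\right)$$

where N is the number of documents, tf(a, w) the count of w in article a, and df(w) the number of documents containing w; each article row is then scaled to unit Euclidean length.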

In [6]:
# rows are articles, columns are vocabulary terms
article_word = vectorizer.fit_transform(article_texts)
article_word


Out[6]:
<985x37487 sparse matrix of type '<type 'numpy.float64'>'
	with 231502 stored elements in Compressed Sparse Row format>
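
As a quick sanity check (a sketch, not part of the original run), the columns of article_word can be mapped back to terms to list the heaviest-weighted words in one article:

    # top ten TF-IDF terms of the first article (illustrative only)
    terms = vectorizer.get_feature_names()
    row = article_word[0].toarray().ravel()
    for idx in row.argsort()[::-1][:10]:
        print('%s %.3f' % (terms[idx], row[idx]))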

In [7]:
# section-by-article indicator matrix: one row per section, one column per article
label_article = sparse.csr_matrix(np.zeros((len(set(sections)),len(articles))))
label_article


Out[7]:
<26x985 sparse matrix of type '<type 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [8]:
# section name -> row index in label_article
label_lookup = dict(zip(set(sections), range(len(set(sections)))))

In [9]:
# article URL -> column index in label_article
article_lookup = dict(zip(urls, range(len(urls))))

In [10]:
# term -> column index in article_word
feature_names = vectorizer.get_feature_names()
word_lookup = dict(zip(feature_names, range(len(feature_names))))
len(word_lookup)


Out[10]:
37487

In [11]:
# put a 1 at (section row, article column) for every article
for article in articles:
    label_article[label_lookup[article['section_name']], article_lookup[article['web_url']]] = 1
label_article


/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:486: SparseEfficiencyWarning: changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  SparseEfficiencyWarning)
Out[11]:
<26x985 sparse matrix of type '<type 'numpy.float64'>'
	with 985 stored elements in Compressed Sparse Row format>
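
Since every article belongs to exactly one section, each column of label_article should now hold exactly one 1, which is easy to verify (a sketch):

    # every article is assigned to exactly one section
    assert (label_article.sum(axis=0) == 1).all()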

In [12]:
# sections x words: total TF-IDF weight each word contributes to each section
cpt = label_article * article_word
cpt


Out[12]:
<26x37487 sparse matrix of type '<type 'numpy.float64'>'
	with 106610 stored elements in Compressed Sparse Row format>
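
This product does the per-section aggregation in one step: multiplying the 26x985 indicator matrix by the 985x37487 TF-IDF matrix sums, for each section, the TF-IDF weights of that section's articles, so every entry acts as a pseudo-count of how strongly a word is tied to a section:

$$\mathrm{cpt}_{c,w} = \sum_{a\,:\,\mathrm{section}(a) = c} \mathrm{tfidf}(a, w)$$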

In [13]:
# turn each row into log conditional probabilities: entry (i, j) becomes
# log(weight / row total); entries never stored stay implicitly zero
ij = zip(cpt.nonzero()[0], cpt.nonzero()[1])
norms = cpt.sum(axis=1)
for i, j in ij:
    cpt[i, j] = log(cpt[i, j] / norms[i][0, 0])
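
Each stored entry now holds a log conditional probability, with TF-IDF mass standing in for word counts:

$$\log P(w \mid c) = \log \frac{\mathrm{cpt}_{c,w}}{\sum_{w'} \mathrm{cpt}_{c,w'}}$$

Entries the sparse matrix never stored stay at zero rather than going to negative infinity, which is the implicit smoothing choice here. The entry-by-entry loop is also slow on large matrices; a vectorized sketch that could replace it (same result up to floating-point rounding):

    # row-normalize with a diagonal scaling matrix, then log the stored entries
    inv = sparse.diags((1.0 / norms).A.ravel(), 0)
    cpt = sparse.csr_matrix(inv * cpt)
    cpt.data = np.log(cpt.data)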

In [14]:
# log prior for each section: the log of its share of all articles
priors = zeros((len(label_lookup)))

for section in sections:
    priors[label_lookup[section]] += 1

priors = matrix(log(priors / priors.sum())).transpose()
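
With n_c articles in section c, this stores the log prior

$$\log P(c) = \log \frac{n_c}{\sum_{c'} n_{c'}}$$

as a 26x1 column, which lines up with the per-section scores computed at prediction time.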

In [16]:
# binary mode ('wb') is the safe way to write pickles
pickle.dump(vectorizer, open('vectorizer.pickle', 'wb'))
pickle.dump(cpt, open('cpt.pickle', 'wb'))
pickle.dump(priors, open('priors.pickle', 'wb'))
pickle.dump(label_lookup, open('label_lookup.pickle', 'wb'))
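
Those four pickles are everything a classifier needs at prediction time. A usage sketch follows; the decision rule is the standard multinomial Naive Bayes score with TF-IDF weights in place of raw counts, and classify is a name introduced here for illustration, not something defined in the notebook:

    # score(c) = log P(c) + sum_w x_w * log P(w | c); pick the best section
    inv_label_lookup = dict((i, name) for name, i in label_lookup.items())

    def classify(text):
        x = vectorizer.transform([text])         # 1 x n_terms TF-IDF row
        scores = priors + cpt * x.transpose()    # 26 x 1 unnormalized log-posteriors
        return inv_label_lookup[int(scores.argmax())]

    classify(article_texts[0])

Because words a section never saw were simply never stored in cpt, they add 0 to that section's score instead of negative infinity, so unseen words are effectively ignored rather than ruled out.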