In [9]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Data processing
import numpy as np
import scipy as sp
import pandas as pd
import math
from scipy import stats
# System
#from __future__ import print_function
import time
import os
import re
import random
import datetime
import json
# Learning & Extraction
from sklearn.decomposition import NMF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
from sklearn.feature_extraction import text
import langid
from topia.termextract import tag
from topia.termextract import extract
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Dimensionality Reduction & Distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
#Plotting
import matplotlib.pyplot as plt
In [5]:
######
# Utility Functions
######
### Data Preprocessing
def remove_numericals(s):
s = "".join([c for c in s if not c.isdigit()])
return s
def remove_punctuation(s):
s2 = ''
for c in s:
if c not in string.punctuation:
s2 = s2 + c
else:
s2 = s2 + ' '
return s2
def remove_propers_POS(s):
tagged = nltk.pos_tag(s.split()) #use NLTK's part of speech tagger
non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
return ''.join([n + " " for n in non_propernouns])
def remove_html(s):
s = re.sub(r'<.+?>', ' ', s)
return s
### Tokenizers
def tokenizer(s):
return nltk.word_tokenize(s)
stemmer = nltk.stem.porter.PorterStemmer()
def stem_tokens(tokens):
return [stemmer.stem(item) for item in tokens]
def stem_tokenizer(s):
return stem_tokens(nltk.word_tokenize(s))
os.chdir(os.getcwd())
def safe_detect(x):
    try:
        return langid.classify(x)[0]
    except Exception:
        print(x)
        return None
def preprocessor(s):
s = remove_html(s)
s = remove_propers_POS(s)
s = remove_numericals(s)
s = remove_punctuation(s)
s = s.lower()
#s = spell_check(s)
return s
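In [ ]:
# Quick sanity check of the preprocessing helpers on an illustrative sentence
# (the sample text is made up here, not taken from the corpus): HTML tags,
# proper nouns, digits and punctuation should be stripped and the result
# lower-cased before tokenizing and stemming.
sample = "<p>Reuters reported that 3 ministers signed the trade agreement in Brussels.</p>"
print(preprocessor(sample))
print(stem_tokenizer(preprocessor(sample)))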
In [4]:
# Load the articles dataset from the JSONL export
version = "_07_07_2016"
with open('amis_articles{0}.jsonl'.format(version)) as f:
    articles = pd.DataFrame(json.loads(line) for line in f)
articles['date'] = pd.to_datetime(articles['date'])
articles['timestamp'] = articles['date'].apply(lambda d: time.mktime(d.timetuple()))
articles = articles.sort_values('date', ascending=True)
articles['raw_article'] = articles['article']
sources = list(articles['source'].unique())
In [ ]:
n_topics = 50
n_words = 20
n_features = 10000  # vocabulary cap for the vectorizer below (assumed value; never defined in the original notebook)
In [16]:
articles.count()
In [ ]:
articles['article'] = articles['raw_article'].apply(
    lambda x: preprocessor(x if isinstance(x, str) else x.decode('utf-8')))
# if s is not np.nan and s != ''
In [ ]:
# Build the term-frequency matrix over the preprocessed articles
features = articles['article']  # assumed corpus: the preprocessed article text from the cell above
tf_vectorizer = text.CountVectorizer(max_df=.95, min_df=2, ngram_range=(1, 1),
                                     max_features=n_features, tokenizer=tokenizer,
                                     stop_words=list(text.ENGLISH_STOP_WORDS))
tf = tf_vectorizer.fit_transform(features)
tf_feature_names = tf_vectorizer.get_feature_names()
In [ ]:
tf_freqs = [(word, tf.getcol(idx).sum()) for word, idx in tf_vectorizer.vocabulary_.items()]
tf_freqs = pd.DataFrame(tf_freqs, columns=('word', 'freq'))
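In [ ]:
# A quick look at the most frequent vocabulary terms as a sketch for sanity-checking
# the CountVectorizer settings (max_df, min_df, stop words); nothing below is used
# by later cells.
tf_freqs.sort_values('freq', ascending=False).head(20)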
In [ ]:
# Fit the NMF model for a range of component counts and record the reconstruction error
import joblib  # on older scikit-learn installs this was available as sklearn.externals.joblib

reconstruction_error = pd.DataFrame(index=np.arange(0, 5*10*11+1),
                                    columns=('n_components', 'alpha', 'l1_ratio', 'reconstruction_error'))
n = 0
a = 0
l = 0
replace = 0
component_grid = list(range(10, 60, 10)) + list(range(60, 200, 20)) + list(range(200, 400, 50))
for c in component_grid:
    if os.path.isfile('models/nmf_c' + str(c) + '.pkl') and not replace == 1:
        nmf_curr = joblib.load('models/nmf_c' + str(c) + '.pkl')
    else:
        nmf_curr = NMF(n_components=c, random_state=1, alpha=a, l1_ratio=l).fit(tf)
        joblib.dump(nmf_curr, 'models/nmf_c' + str(c) + '.pkl')
    reconstruction_error.loc[n] = [c, a, l, nmf_curr.reconstruction_err_]
    n += 1
joblib.dump(reconstruction_error, 'models/nmf_reconstruction_error.pkl')
reconstruction_error = joblib.load('models/nmf_reconstruction_error.pkl')
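In [ ]:
# Sketch: plot reconstruction error against the number of components to look for
# an elbow when choosing the topic count (assumes the grid-search cell above has
# populated reconstruction_error for the component grid).
re_fitted = reconstruction_error.dropna()
plt.plot(re_fitted['n_components'].astype(float), re_fitted['reconstruction_error'].astype(float), marker='o')
plt.xlabel('n_components')
plt.ylabel('reconstruction error')
plt.title('NMF reconstruction error by number of topics')
plt.show()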
In [ ]:
# Inspect the topics of the 100-component model
nmf100 = joblib.load('models/nmf_c100.pkl')  # assumed: the 100-topic model fitted in the grid above; the original never shows nmf100 being defined
nmf100_topics = list()
nmf100_labels = list()
nmf100_user_topics = nmf100.components_ * tf.transpose()  # topic loadings for each document
for topic_idx, topic in enumerate(nmf100.components_):
print("Topic #%d: " % topic_idx + " ".join([tf_feature_names[i] for i in topic.argsort()[:-51:-1]]))
nmf100_topics.append("Topic #%d: " % topic_idx + " ".join([tf_feature_names[i] for i in topic.argsort()[:-11:-1]]))
nmf100_labels.append(" ".join([tf_feature_names[x] for x in topic.argsort()[-3:]]))
nmf100_labels = np.asarray(nmf100_labels)
In [ ]:
# Save the top saved_words terms and their weights for each topic
saved_words = 100
topic_components = pd.DataFrame(index=np.arange(0, n_topics*saved_words),
                                columns=('id_topic', 'word_rank', 'text', 'weight'))
for topic_idx, topic in enumerate(nmf100.components_):
    sorted_topics = topic.argsort()[:-saved_words - 1:-1]  # indices of the top saved_words terms
    n = 0
    for i in sorted_topics[0:saved_words]:
        topic_components.loc[saved_words*topic_idx + n] = [topic_idx, n+1, tf_feature_names[i], topic[i]]
        n += 1
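In [ ]:
# Sketch: inspect the saved per-topic word table for a single topic (topic 0 here,
# an arbitrary illustrative choice) to check that word_rank and weight look sensible.
topic_components[topic_components['id_topic'] == 0].head(10)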
In [ ]:
start_index = 10000
n_index = 10000
user_dist = np.matrix(1 - cosine_similarity(tf[start_index:start_index+n_index])).round(2)
np.fill_diagonal(user_dist, np.inf)
user_dist[np.where(user_dist <.01)] = np.inf
user_min_dist = np.where(user_dist == user_dist.min())[1]
In [ ]:
print('The two closest samples have a cosine distance of ' + str(user_dist.min()))
print('')
print('About Me for user 1: ' + str(df_about['dim_about_me'][about_index[start_index + user_min_dist[0]]]))
print('')
print('About Me for user 2: ' + str(df_about['dim_about_me'][about_index[start_index + user_min_dist[1]]]))
In [ ]:
topic_dist = np.matrix(1 - cosine_similarity(nmf100.components_))
topic_min_dist = np.where(topic_dist == topic_dist[np.where(topic_dist > 0.01)].min())[0]
In [ ]:
print('The two closest topics have a cosine distance of ' + str(topic_dist[np.where(topic_dist > 0.01)].min()))
print('')
print('Top words for topic 1: ' + str(nmf100_topics[topic_min_dist[0]]))
print('')
print('Top words for topic 2: ' + str(nmf100_topics[topic_min_dist[1]]))
In [ ]:
linkage_matrix = ward(topic_dist)  # Ward clustering on the pre-computed topic distances
fig, ax = plt.subplots(figsize=(10, 60))  # set size
dendrogram(linkage_matrix, orientation="left", labels=np.array(nmf100_labels), leaf_font_size=16)
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)
plt.tight_layout()
plt.savefig('topic_hierarchy.png', dpi=200)  # save the topic hierarchy figure
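In [ ]:
# Sketch: cut the Ward tree into a fixed number of flat clusters and group the
# topic labels by cluster. The cluster count (10) is an arbitrary illustrative
# choice, not a value from the original analysis.
from scipy.cluster.hierarchy import fcluster
cluster_ids = fcluster(linkage_matrix, t=10, criterion='maxclust')
topic_clusters = pd.DataFrame({'label': nmf100_labels, 'cluster': cluster_ids})
topic_clusters.sort_values('cluster').head(20)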