In [9]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Data processing
import numpy as np
import scipy as sp
import pandas as pd
import math
from scipy import stats



# System 
#from __future__ import print_function
import time
import os
import re
import random
import datetime
import json


# Learning & Extraction
from sklearn.decomposition import NMF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string

from sklearn.feature_extraction import text
from sklearn.externals import joblib  # used below to persist fitted models (plain `import joblib` on newer scikit-learn)
import langid
from topia.termextract import tag
from topia.termextract import extract

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')

regex = re.compile('[%s]' % re.escape(string.punctuation))


# Dimensionality Reduction & Distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform

#Plotting
import matplotlib.pyplot as plt


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[nltk_data] Downloading package punkt to /Users/mrpozzi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mrpozzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mrpozzi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mrpozzi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.

In [ ]:


In [5]:
######
# Utility Functions
######
### Data Preprocessing
def remove_numericals(s):
    s = "".join([c for c in s if not c.isdigit()])
    return s

def remove_punctuation(s):
    # replace every punctuation character with a space (uses the regex compiled above)
    return regex.sub(' ', s)

def remove_propers_POS(s):
    tagged = nltk.pos_tag(s.split())  # use NLTK's part-of-speech tagger
    non_propernouns = [word for word, pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return ' '.join(non_propernouns)

def remove_html(s):
    s = re.sub(r'<.+?>', ' ', s)
    return s


### Tokenizers

def tokenizer(s):
    return nltk.word_tokenize(s)

stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

def stem_tokenizer(s):
    return stem_tokens(nltk.word_tokenize(s))

def safe_detect(x):
    # langid can choke on empty or malformed strings; fall back to None
    try:
        return langid.classify(x)[0]
    except Exception:
        print(x)
        return None

def preprocessor(s):
    s = remove_html(s)
    s = remove_propers_POS(s)
    s = remove_numericals(s)
    s = remove_punctuation(s)
    s = s.lower()
    #s = spell_check(s)
    return s
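
Before running the cleaning over the full corpus, a quick sanity check on a made-up snippet shows what the pipeline strips out (the sample text below is illustrative only).


In [ ]:
# Sketch: the preprocessor and the stemming tokenizer on a toy snippet
sample = '<p>Wheat futures rose 3% on Tuesday, analysts at the USDA said.</p>'
print(preprocessor(sample))
print(stem_tokenizer(preprocessor(sample)))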

In [4]:
version = "_07_07_2016"
with open('amis_articles{0}.jsonl'.format(version)) as f:
    articles = pd.DataFrame(json.loads(line) for line in f)

articles['date'] = pd.to_datetime(articles['date'])
articles['timestamp'] = articles['date'].apply(lambda d: time.mktime(d.timetuple()))
articles = articles.sort_values('date', ascending=True)

articles['raw_article'] = articles['article'] 

sources = list(articles['source'].unique())

In [ ]:
n_topics = 50
n_words = 20
n_features = 10000  # vocabulary cap used by the vectorizer below; the value is an assumption, it was not set in the original notebook

In [16]:
articles.count()


Out[16]:
article        126602
date           126602
link           126602
source         126602
title          126602
timestamp      126602
raw_article    126602
dtype: int64

In [ ]:


In [ ]:
articles['article'] = articles['raw_article'].apply(
    lambda x: preprocessor(x.decode('utf-8') if isinstance(x, bytes) else x))

In [ ]:
tf_vectorizer = text.CountVectorizer(max_df=.95, min_df=2, ngram_range=(1, 1),
                                     max_features=n_features, tokenizer=tokenizer,
                                     stop_words=list(text.ENGLISH_STOP_WORDS))
tf = tf_vectorizer.fit_transform(articles['article'])  # the original cell referenced an undefined `features`; the cleaned articles are what gets vectorized
tf_feature_names = tf_vectorizer.get_feature_names()

In [ ]:
tf_freqs = [(word, tf.getcol(idx).sum()) for word, idx in tf_vectorizer.vocabulary_.items()]
tf_freqs = pd.DataFrame(tf_freqs, columns=('word', 'freq'))
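
Listing the most frequent terms is a cheap check for stop-words or boilerplate that survived cleaning. A minimal look at the `tf_freqs` frame built above:


In [ ]:
# Sketch: the 20 most frequent terms in the term-document matrix
tf_freqs.sort_values('freq', ascending=False).head(20)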

In [ ]:

Non-negative Matrix Factorization


In [ ]:
# Fit NMF models over a grid of component counts (alpha and l1_ratio are held at 0 here)
reconstruction_error = pd.DataFrame(index=np.arange(0, 5*10*11+1),
                                    columns=('n_components', 'alpha', 'l1_ratio', 'reconstruction_error'))
n = 0
a = 0
l = 0
replace = False  # set to True to refit even when a pickled model already exists

for c in list(range(10, 60, 10)) + list(range(60, 200, 20)) + list(range(200, 400, 50)):
    if os.path.isfile('models/nmf_c' + str(c) + '.pkl') and not replace:
        nmf_curr = joblib.load('models/nmf_c' + str(c) + '.pkl')
    else:
        nmf_curr = NMF(n_components=c, random_state=1, alpha=a, l1_ratio=l).fit(tf)
        joblib.dump(nmf_curr, 'models/nmf_c' + str(c) + '.pkl')
    reconstruction_error.loc[n] = [c, a, l, nmf_curr.reconstruction_err_]
    n += 1

joblib.dump(reconstruction_error, 'models/nmf_reconstruction_error.pkl')
reconstruction_error = joblib.load('models/nmf_reconstruction_error.pkl')
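
Plotting the reconstruction error against the number of components gives a rough elbow to pick a topic count from. The sketch below only re-plots the `reconstruction_error` frame built above; the figure was not part of the original run.


In [ ]:
# Sketch: elbow plot of NMF reconstruction error vs. number of components
err = reconstruction_error.dropna().astype(float)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(err['n_components'], err['reconstruction_error'], marker='o')
ax.set_xlabel('n_components')
ax.set_ylabel('reconstruction error')
plt.show()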

In [ ]:
# `nmf100` was not defined in the original notebook; assume it is the 100-component model pickled above
nmf100 = joblib.load('models/nmf_c100.pkl')

nmf100_topics = list()
nmf100_labels = list()

nmf100_user_topics = nmf100.components_ * tf.transpose()  # topic-by-document weight matrix

for topic_idx, topic in enumerate(nmf100.components_):
    print("Topic #%d: " % topic_idx + " ".join([tf_feature_names[i] for i in topic.argsort()[:-51:-1]]))
    nmf100_topics.append("Topic #%d: " % topic_idx + " ".join([tf_feature_names[i] for i in topic.argsort()[:-11:-1]]))
    nmf100_labels.append(" ".join([tf_feature_names[x] for x in topic.argsort()[-3:]]))

nmf100_labels = np.asarray(nmf100_labels)
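
Projecting the documents onto the topic space shows how the topics spread over the corpus. A minimal sketch, assuming `nmf100` and `tf` as defined above; the counts are purely illustrative.


In [ ]:
# Sketch: dominant topic per article, counted over the whole corpus
doc_topic = nmf100.transform(tf)            # (n_articles, n_topics) weight matrix
dominant_topic = doc_topic.argmax(axis=1)   # strongest topic for each article
pd.Series(nmf100_labels[dominant_topic]).value_counts().head(10)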

In [ ]:
saved_words = 100
n_components = nmf100.components_.shape[0]
topic_components = pd.DataFrame(index=np.arange(0, n_components * saved_words),
                                columns=('id_topic', 'word_rank', 'text', 'weight'))

for topic_idx, topic in enumerate(nmf100.components_):
    sorted_topics = topic.argsort()[:-saved_words - 1:-1]  # top `saved_words` words for this topic
    n = 0
    for i in sorted_topics:
        topic_components.loc[saved_words * topic_idx + n] = [topic_idx, n + 1, tf_feature_names[i], topic[i]]
        n += 1

In [ ]:

Similar Articles


In [ ]:
start_index = 10000
n_index = 10000
# a plain ndarray keeps the np.where indexing below simple (np.matrix returns 2-D results)
user_dist = np.asarray(1 - cosine_similarity(tf[start_index:start_index + n_index])).round(2)
np.fill_diagonal(user_dist, np.inf)
user_dist[np.where(user_dist < .01)] = np.inf  # mask (near-)duplicate articles
user_min_dist = np.where(user_dist == user_dist.min())[1]

In [ ]:
print('The two closest articles have a cosine distance of ' + str(user_dist.min()))
print('')
# the original cell referenced `df_about` / `about_index` from another notebook; show the matched article titles instead
print('Title of article 1: ' + str(articles['title'].iloc[start_index + user_min_dist[0]]))
print('')
print('Title of article 2: ' + str(articles['title'].iloc[start_index + user_min_dist[1]]))
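
The same distance matrix can be reused to list the nearest neighbours of any single article in the slice. A minimal sketch; the query position is arbitrary.


In [ ]:
# Sketch: five most similar articles to one query article in the 10k slice
query = 42  # arbitrary position within the slice
row = np.asarray(user_dist)[query]
nearest = row.argsort()[:5]
for rank, j in enumerate(nearest, start=1):
    print('%d. (dist %.2f) %s' % (rank, row[j], articles['title'].iloc[start_index + j]))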

Similar Topics


In [ ]:
topic_dist = np.asarray(1 - cosine_similarity(nmf100.components_))
topic_min_dist = np.where(topic_dist == topic_dist[np.where(topic_dist > 0.01)].min())[0]

In [ ]:
print('The two closest topics have a cosine distance of ' + str(topic_dist[np.where(topic_dist > 0.01)].min()))
print('')
print('Top words for topic 1: ' + str(nmf100_topics[topic_min_dist[0]]))
print('')
print('Top words for topic 2: ' + str(nmf100_topics[topic_min_dist[1]]))

In [ ]:

Topic Hierarchy


In [ ]:
# Ward linkage on the pre-computed topic distances
# (scipy's ward expects a condensed distance matrix, hence the squareform)
linkage_matrix = ward(squareform(topic_dist, checks=False))

fig, ax = plt.subplots(figsize=(10, 60))  # set size
dendrogram(linkage_matrix, orientation="left", labels=nmf100_labels, leaf_font_size=16)

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout()

plt.savefig('topic_hierarchy.png', dpi=200)  # save the dendrogram to disk
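
The linkage matrix can also be cut into a flat grouping of topics, which is handier than reading the full dendrogram. A minimal sketch using scipy's fcluster; the number of groups is arbitrary.


In [ ]:
# Sketch: cut the Ward tree into a flat set of topic groups
from scipy.cluster.hierarchy import fcluster

n_groups = 10  # arbitrary choice for illustration
topic_groups = fcluster(linkage_matrix, t=n_groups, criterion='maxclust')
pd.DataFrame({'label': nmf100_labels, 'group': topic_groups}).sort_values('group').head(20)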
