In [9]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Data processing
import numpy as np
import scipy as sp
import pandas as pd
import math
from scipy import stats
# System
#from __future__ import print_function
import time
import os
import re
import random
import datetime
import json
# Learning & Extraction
from sklearn.decomposition import NMF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
from sklearn.feature_extraction import text
import langid
from topia.termextract import tag
from topia.termextract import extract
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Dimensionality Reduction & Distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
#Plotting
import matplotlib.pyplot as plt
In [5]:
######
# Utility Functions
######
### Data Preprocessing
def remove_numericals(s):
s = "".join([c for c in s if not c.isdigit()])
return s
def remove_punctuation(s):
s2 = ''
for c in s:
if c not in string.punctuation:
s2 = s2 + c
else:
s2 = s2 + ' '
return s2
def remove_propers_POS(s):
tagged = nltk.pos_tag(s.split()) #use NLTK's part of speech tagger
non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
return ''.join([n + " " for n in non_propernouns])
def remove_html(s):
s = re.sub(r'<.+?>', ' ', s)
return s
### Tokenizers
def tokenizer(s):
return nltk.word_tokenize(s)
stemmer = nltk.stem.porter.PorterStemmer()
def stem_tokens(tokens):
return [stemmer.stem(item) for item in tokens]
def stem_tokenizer(s):
return stem_tokens(nltk.word_tokenize(s))
os.chdir(os.getcwd())
def safe_detect(x):
    try:
        return langid.classify(x)[0]
    except Exception:
        print(x)
        return None
def preprocessor(s):
s = remove_html(s)
s = remove_propers_POS(s)
s = remove_numericals(s)
s = remove_punctuation(s)
s = s.lower()
#s = spell_check(s)
return s
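In [ ]:
# Quick sanity check of the preprocessing helpers on an illustrative sentence
# (the sample text is made up here, not taken from the corpus): HTML tags,
# proper nouns, digits and punctuation should be stripped and the result
# lower-cased before tokenizing and stemming.
sample = "<p>Reuters reported that 3 ministers signed the trade agreement in Brussels.</p>"
print(preprocessor(sample))
print(stem_tokenizer(preprocessor(sample)))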
In [4]:
# Load the articles dataset from the JSONL export
version = "_07_07_2016"
with open('amis_articles{0}.jsonl'.format(version)) as f:
    articles = pd.DataFrame(json.loads(line) for line in f)
articles['date'] = pd.to_datetime(articles['date'])
articles['timestamp'] = articles['date'].apply(lambda d: time.mktime(d.timetuple()))
articles = articles.sort_values('date', ascending=True)
articles['raw_article'] = articles['article']
sources = list(articles['source'].unique())
In [ ]:
n_topics = 50
n_words = 20
n_features = 10000  # vocabulary cap for the vectorizer below (assumed value; never defined in the original notebook)
In [16]:
articles.count()
In [ ]:
articles['article'] = articles['raw_article'].apply(
    lambda x: preprocessor(x if isinstance(x, str) else x.decode('utf-8')))
# if s is not np.nan and s != ''
In [ ]:
# Build the term-frequency matrix over the preprocessed articles
features = articles['article']  # assumed corpus: the preprocessed article text from the cell above
tf_vectorizer = text.CountVectorizer(max_df=.95, min_df=2, ngram_range=(1, 1),
                                     max_features=n_features, tokenizer=tokenizer,
                                     stop_words=list(text.ENGLISH_STOP_WORDS))
tf = tf_vectorizer.fit_transform(features)
tf_feature_names = tf_vectorizer.get_feature_names()
In [ ]:
tf_freqs = [(word, tf.getcol(idx).sum()) for word, idx in tf_vectorizer.vocabulary_.items()]
tf_freqs = pd.DataFrame(tf_freqs, columns=('word', 'freq'))
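In [ ]:
# A quick look at the most frequent vocabulary terms as a sketch for sanity-checking
# the CountVectorizer settings (max_df, min_df, stop words); nothing below is used
# by later cells.
tf_freqs.sort_values('freq', ascending=False).head(20)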
In [ ]:
# Fit the NMF model for a range of component counts and record the reconstruction error
import joblib  # on older scikit-learn installs this was available as sklearn.externals.joblib

reconstruction_error = pd.DataFrame(index=np.arange(0, 5*10*11+1),
                                    columns=('n_components', 'alpha', 'l1_ratio', 'reconstruction_error'))
n = 0
a = 0
l = 0
replace = 0
component_grid = list(range(10, 60, 10)) + list(range(60, 200, 20)) + list(range(200, 400, 50))
for c in component_grid:
    if os.path.isfile('models/nmf_c' + str(c) + '.pkl') and not replace == 1:
        nmf_curr = joblib.load('models/nmf_c' + str(c) + '.pkl')
    else:
        nmf_curr = NMF(n_components=c, random_state=1, alpha=a, l1_ratio=l).fit(tf)
        joblib.dump(nmf_curr, 'models/nmf_c' + str(c) + '.pkl')
    reconstruction_error.loc[n] = [c, a, l, nmf_curr.reconstruction_err_]
    n += 1
joblib.dump(reconstruction_error, 'models/nmf_reconstruction_error.pkl')
reconstruction_error = joblib.load('models/nmf_reconstruction_error.pkl')
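In [ ]:
# Sketch: plot reconstruction error against the number of components to look for
# an elbow when choosing the topic count (assumes the grid-search cell above has
# populated reconstruction_error for the component grid).
re_fitted = reconstruction_error.dropna()
plt.plot(re_fitted['n_components'].astype(float), re_fitted['reconstruction_error'].astype(float), marker='o')
plt.xlabel('n_components')
plt.ylabel('reconstruction error')
plt.title('NMF reconstruction error by number of topics')
plt.show()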
In [ ]:
# Inspect the topics of the 100-component model
nmf100 = joblib.load('models/nmf_c100.pkl')  # assumed: the 100-topic model fitted in the grid above; the original never shows nmf100 being defined
nmf100_topics = list()
nmf100_labels = list()
nmf100_user_topics = nmf100.components_ * tf.transpose()  # topic loadings for each document
for topic_idx, topic in enumerate(nmf100.components_):
print("Topic #%d: " % topic_idx + " ".join([tf_feature_names[i] for i in topic.argsort()[:-51:-1]]))
nmf100_topics.append("Topic #%d: " % topic_idx + " ".join([tf_feature_names[i] for i in topic.argsort()[:-11:-1]]))
nmf100_labels.append(" ".join([tf_feature_names[x] for x in topic.argsort()[-3:]]))
nmf100_labels = np.asarray(nmf100_labels)
In [ ]:
# Save the top saved_words terms and their weights for each topic
saved_words = 100
topic_components = pd.DataFrame(index=np.arange(0, n_topics*saved_words),
                                columns=('id_topic', 'word_rank', 'text', 'weight'))
for topic_idx, topic in enumerate(nmf100.components_):
    sorted_topics = topic.argsort()[:-saved_words - 1:-1]  # indices of the top saved_words terms
    n = 0
    for i in sorted_topics[0:saved_words]:
        topic_components.loc[saved_words*topic_idx + n] = [topic_idx, n+1, tf_feature_names[i], topic[i]]
        n += 1
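In [ ]:
# Sketch: inspect the saved per-topic word table for a single topic (topic 0 here,
# an arbitrary illustrative choice) to check that word_rank and weight look sensible.
topic_components[topic_components['id_topic'] == 0].head(10)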
In [ ]:
start_index = 10000
n_index = 10000
user_dist = np.matrix(1 - cosine_similarity(tf[start_index:start_index+n_index])).round(2)
np.fill_diagonal(user_dist, np.inf)
user_dist[np.where(user_dist <.01)] = np.inf
user_min_dist = np.where(user_dist == user_dist.min())[1]
In [ ]:
print('The two closest samples have a cosine distance of ' + str(user_dist.min()))
print('')
print('About Me for user 1: ' + str(df_about['dim_about_me'][about_index[start_index + user_min_dist[0]]]))
print('')
print('About Me for user 2: ' + str(df_about['dim_about_me'][about_index[start_index + user_min_dist[1]]]))
In [ ]:
topic_dist = np.matrix(1 - cosine_similarity(nmf100.components_))
topic_min_dist = np.where(topic_dist == topic_dist[np.where(topic_dist > 0.01)].min())[0]
In [ ]:
print('The two closest topics have a cosine distance of ' + str(topic_dist[np.where(topic_dist > 0.01)].min()))
print('')
print('Top words for topic 1: ' + str(nmf100_topics[topic_min_dist[0]]))
print('')
print('Top words for topic 2: ' + str(nmf100_topics[topic_min_dist[1]]))
In [ ]:
linkage_matrix = ward(topic_dist)  # Ward clustering on the pre-computed topic distances
fig, ax = plt.subplots(figsize=(10, 60))  # set size
dendrogram(linkage_matrix, orientation="left", labels=np.array(nmf100_labels), leaf_font_size=16)
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)
plt.tight_layout()
plt.savefig('topic_hierarchy.png', dpi=200)  # save the topic hierarchy figure
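In [ ]:
# Sketch: cut the Ward tree into a fixed number of flat clusters and group the
# topic labels by cluster. The cluster count (10) is an arbitrary illustrative
# choice, not a value from the original analysis.
from scipy.cluster.hierarchy import fcluster
cluster_ids = fcluster(linkage_matrix, t=10, criterion='maxclust')
topic_clusters = pd.DataFrame({'label': nmf100_labels, 'cluster': cluster_ids})
topic_clusters.sort_values('cluster').head(20)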