In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [196]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim
from utils.common.db_utils import read_all_results
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = stopwords.words('english')
# create p_stemmer, an English SnowballStemmer (stopword-aware)
p_stemmer = SnowballStemmer('english', ignore_stopwords=True)
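
# Quick sanity check of the preprocessing chain (illustrative only; the sentence
# below is made up): tokenise -> drop stopwords -> stem, as used in generate_corpus.
_sample = "An introductory course covering the foundations of machine learning"
_tokens = [t for t in tokenizer.tokenize(_sample.lower()) if t not in en_stop]
print([p_stemmer.stem(t) for t in _tokens])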
In [4]:
config = dict(parameters={},DEFAULT={})
config["parameters"]["input_db"] = "tier-1"
config["DEFAULT"]["tier-1"] = "/Users/jklinger/Nesta/nesta_dataflow/db_config/tier-1.cnf"
config["parameters"]["input_table"] = "course_descriptions"
results = read_all_results(config,"input_db","input_table")
doc_set = [r[1] for r in results]
In [379]:
def generate_corpus(doc_set, extra_stops=[]):
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for doc in doc_set:
        if doc is None:
            continue
        # clean and tokenize document string
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        tokens = [t for t in tokens if t not in en_stop]
        # stem tokens
        tokens = [p_stemmer.stem(s) for s in tokens]
        # remove any extra stop words, reporting how many were dropped per document
        before = len(tokens)
        tokens = [t for t in tokens if t not in extra_stops]
        diff = before - len(tokens)
        if diff > 0:
            print(diff)
        # add tokens to list
        texts.append(tokens)
    print("Got", len(texts))
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    return texts, corpus, dictionary
In [211]:
# generate LDA model
texts,corpus,dictionary = generate_corpus(doc_set)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=1000)
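
# Optional rough quality measure for the fitted model (CoherenceModel is part of
# gensim; 'c_v' is one of its supported coherence measures) - a sketch, not a tuned evaluation.
from gensim.models import CoherenceModel
cm = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
print("topic coherence:", cm.get_coherence())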
In [219]:
ldamodel.print_topics(num_topics=30)
Out[219]:
In [1]:
# gensim < 4.0 API (`size`, `syn0`, `index2word`); these were renamed in later releases
model = gensim.models.Word2Vec(texts, size=500, window=5, min_count=5, workers=4, sample=1e-3)
# normalise the vectors in place so that word/topic distances are on a common scale
model.wv.init_sims(replace=True)
wv = model.wv.syn0
word_mapping = {model.wv.index2word[i]: v for i, v in enumerate(wv)}
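
# Optional sanity check on the embeddings (illustrative; "learn" is an assumed
# stemmed token and may not be in this vocabulary): neighbours should look topical.
probe = "learn"
if probe in word_mapping:
    print(model.wv.most_similar(probe, topn=5))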
In [294]:
import numpy as np
from collections import Counter

# Token counts across the tokenised corpus; assumed definition of the `counter`
# used to colour the scatter plots below.
counter = Counter(token for text in texts for token in text)
class WordPotential:
    """Per-word record of (distance, weight, topic) across all topics."""
    def __init__(self):
        self.topics = []
        self.weights = []
        self.distances = []

    def append(self, distance, weight, topic):
        self.topics.append(topic)
        self.distances.append(distance)
        self.weights.append(weight)

    def forces(self):
        # spring analogy: force = weight * distance
        for d, w in self.items():
            yield w * d

    def energies(self):
        # spring analogy: energy = 0.5 * weight * distance^2
        for d, w in self.items():
            yield 0.5 * w * d * d

    def items(self):
        return zip(self.distances, self.weights)
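
# Illustrative check of the spring analogy encoded above: for weight w and distance d,
# force = w*d and energy = 0.5*w*d^2 (the values below are made up).
_wp = WordPotential()
_wp.append(distance=2.0, weight=0.5, topic=0)
assert list(_wp.forces()) == [1.0]
assert list(_wp.energies()) == [1.0]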
topic_positions = []
all_forces = []
word_potentials = {}
x = []
for itopic, topic in enumerate(ldamodel.get_topics()):
    # topic position = weighted average of its word vectors
    weighted_positions = []
    for idx, weight in enumerate(topic):
        word = dictionary.get(idx)
        if word in word_mapping:
            pos = word_mapping[word]
            weighted_positions.append(weight * pos)
        elif weight > 0.01:
            print(word, weight)
    topic_pos = np.average(weighted_positions, axis=0)
    topic_positions.append(topic_pos)
    # collect weight*distance (the "force") for every word in this topic
    forces = []
    for idx, weight in enumerate(topic):
        word = dictionary.get(idx)
        if word not in word_mapping:
            continue
        pos = word_mapping[word]
        distance = np.linalg.norm(topic_pos - pos)
        x.append(distance)
        force = distance * weight
        forces.append(force)
        if word not in word_potentials:
            word_potentials[word] = WordPotential()
        word_potentials[word].append(distance, weight, itopic)
    all_forces.append(forces)
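
# Quick sanity check on what the loop produced (shapes assume the 500-d word2vec
# vectors and 30 LDA topics configured above).
print(len(topic_positions), "topic centroids of dimension", topic_positions[0].shape)
print(len(word_potentials), "words with at least one (distance, weight, topic) record")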
In [295]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import mpld3
word_max_energy = [max(wp.energies()) for word, wp in word_potentials.items()]
word_min_distance = [min(wp.distances) for word, wp in word_potentials.items()]
n = [counter[word] for word, wp in word_potentials.items()]
#n = [c if c < 100 else 100 for c in n]
fig, ax = plt.subplots(figsize=(12, 6))
scatter = ax.scatter(word_max_energy, word_min_distance, c=n, cmap="gist_rainbow", vmax=250)
ax.set_xlabel("Word's maximum energy", fontsize=14)
ax.set_ylabel("Closest distance to a topic", fontsize=14)
#ax.set_ylim(0.,0.0004)
#ax.set_xlim(0.0,0.05)
fig.colorbar(scatter)
labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
Out[295]:
In [347]:
word_energy_std = [np.std(list(wp.energies())) for word, wp in word_potentials.items()]
word_energy_sum = [np.sum(list(wp.energies())) for word, wp in word_potentials.items()]
n = [counter[word] for word, wp in word_potentials.items()]
fig, ax = plt.subplots(figsize=(12, 6))
scatter = ax.scatter(word_energy_std, word_energy_sum, c=n, cmap="gist_rainbow", vmax=250)
ax.set_xlabel("Std of word's energies (Genericity)", fontsize=14)
ax.set_ylabel("Sum of word's energies (Ambiguity)", fontsize=14)
#ax.set_ylim(0.,1.5)
#ax.set_xlim(0.0,0.018)
fig.colorbar(scatter)
labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
Out[347]:
In [360]:
specificity = [np.std(list(wp.energies())) / np.mean(list(wp.energies())) for word, wp in word_potentials.items()]
prevalence = [np.sqrt(np.sum(np.square(wp.weights))) for word, wp in word_potentials.items()]
fig, ax = plt.subplots(figsize=(12, 6))
scatter = ax.scatter(specificity, prevalence, c=n, cmap="gist_rainbow", vmax=250)
ax.set_xlabel("Std/mean of word energy (Specificity)", fontsize=14)
ax.set_ylabel("Sqrt sum of word weight^2 (Topic prevalence)", fontsize=14)
#ax.set_ylim(0.,0.002)
#ax.set_xlim(0.0,0.018)
fig.colorbar(scatter)
labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
Out[360]:
In [380]:
extra_stops = []
for word, wp in word_potentials.items():
    specificity = np.std(list(wp.energies())) / np.mean(list(wp.energies()))
    if specificity < 1.5:  # and max_force < 0.35:
        extra_stops.append(word)
print(extra_stops)
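# How aggressive is the cut? A short summary of the threshold chosen above.
print(len(extra_stops), "of", len(word_potentials), "stemmed words flagged as generic extra stopwords")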
texts, corpus, dictionary = generate_corpus(doc_set, extra_stops=extra_stops)
In [381]:
# generate LDA model
texts, corpus, dictionary = generate_corpus(doc_set, extra_stops=extra_stops)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=1000)
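
# Re-check coherence after removing the generic words, to compare against the first fit
# (same 'c_v' measure as before; a rough indicator, not a definitive comparison).
from gensim.models import CoherenceModel
cm = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
print("topic coherence after extra stopwords:", cm.get_coherence())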
In [382]:
ldamodel.print_topics(num_topics=30)
Out[382]:
In [ ]: