In [1]:
from __future__ import division
from collections import defaultdict
import random
In [2]:
data = [("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),
        ("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),
        ("data science", 60, 70), ("analytics", 90, 3),
        ("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),
        ("actionable insights", 40, 30), ("think out of the box", 45, 10),
        ("self-starter", 30, 50), ("customer focus", 65, 15),
        ("thought leadership", 35, 35)]
In [3]:
# helper to clean up curly apostrophes in the scraped text
def fix_unicode(text):
    return text.replace(u"\u2019", "'")
In [4]:
from bs4 import BeautifulSoup
import requests
import re
url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')
In [5]:
content = soup.find("div", "a-body")  # find the div containing the article text
regex = r"[\w']+|[\.]"                # matches words (with apostrophes) or periods
document = []
for paragraph in content('p'):
    words = re.findall(regex, fix_unicode(paragraph.text))
    document.extend(words)
In [6]:
bigrams = zip(document, document[1:])
transitions = defaultdict(list)
for prev, current in bigrams:
    transitions[prev].append(current)
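For intuition, here's what the transitions dict holds on a tiny made-up document (an illustration added here, not original notebook code):

# toy example: each word maps to the list of words that ever followed it
toy = ["I", "like", "data", ".", "I", "like", "Python", "."]
toy_transitions = defaultdict(list)
for prev, current in zip(toy, toy[1:]):
    toy_transitions[prev].append(current)
# toy_transitions["like"] == ["data", "Python"], so after "like" the bigram
# model picks "data" or "Python" with equal probability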
In [7]:
def generate_using_bigrams():
    current = "."   # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]    # all words that followed current
        current = random.choice(next_word_candidates)  # choose one at random
        result.append(current)                         # append it to results
        if current == ".":                             # if it's a period, we're done
            return " ".join(result)
In [8]:
generate_using_bigrams()
Out[8]:
In [9]:
trigrams = zip(document, document[1:], document[2:])
trigram_transitions = defaultdict(list)
starts = []
for prev, current, next in trigrams:
    if prev == ".":              # if the previous "word" was a period
        starts.append(current)   # then this word starts a sentence
    trigram_transitions[(prev, current)].append(next)
In [20]:
def generate_using_trigrams():
    current = random.choice(starts)
    prev = "."
    result = [current]
    while True:
        next_word_candidates = trigram_transitions[(prev, current)]
        next_word = random.choice(next_word_candidates)
        prev, current = current, next_word
        result.append(current)
        if current == ".":
            return " ".join(result)
In [21]:
generate_using_trigrams()
Out[21]:
In [22]:
grammar = {
    "_S"  : ["_NP _VP"],
    "_NP" : ["_N",
             "_A _NP _P _A _N"],
    "_VP" : ["_V",
             "_V _NP"],
    "_N"  : ["data science", "Python", "regression"],
    "_A"  : ["big", "linear", "logistic"],
    "_P"  : ["about", "near"],
    "_V"  : ["learns", "trains", "tests", "is"]
}
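As a worked illustration (added here, not in the notebook), one possible sequence of expansions this grammar allows, expanding the leftmost non-terminal each time:

#   ["_S"]
#   ["_NP", "_VP"]
#   ["_N", "_VP"]
#   ["Python", "_VP"]
#   ["Python", "_V", "_NP"]
#   ["Python", "trains", "_NP"]
#   ["Python", "trains", "_N"]
#   ["Python", "trains", "regression"]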
In [31]:
def is_terminal(token):
    return token[0] != "_"

def expand(grammar, tokens):
    for i, token in enumerate(tokens):
        # skip over terminals
        if is_terminal(token):
            continue
        # found a non-terminal, so choose a replacement at random
        replacement = random.choice(grammar[token])
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
        # now call expand on the new list of tokens
        return expand(grammar, tokens)
    # if we get here, every token was a terminal and we're done
    return tokens

def generate_sentence(grammar):
    return expand(grammar, ["_S"])
In [34]:
generate_sentence(grammar)
Out[34]:
In [37]:
def roll_a_die():
    return random.choice([1, 2, 3, 4, 5, 6])

def direct_sample():
    d1 = roll_a_die()
    d2 = roll_a_die()
    return d1, d1 + d2

def random_y_given_x(x):
    """the sum y is x plus the roll of a second die"""
    return x + roll_a_die()

def random_x_given_y(y):
    if y <= 7:
        # if the total is 7 or less, the first die is equally
        # likely to be 1, 2, ..., (total - 1)
        return random.randrange(1, y)
    else:
        # if the total is more than 7, the first die is equally
        # likely to be (total - 6), (total - 5), ..., 6
        return random.randrange(y - 6, 7)

def gibbs_sample(num_iters=100):
    x, y = 1, 2  # arbitrary starting values
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y

def compare_distributions(num_samples=1000):
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1
        counts[direct_sample()][1] += 1  # compare Gibbs against direct sampling
    return counts
In [38]:
compare_distributions()
Out[38]:
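The raw counts are hard to eyeball, so here's a small helper, added as a sketch rather than original notebook code, that turns them into side-by-side fractions; the two columns should be roughly equal if the Gibbs sampler is producing the same joint distribution as direct sampling:

def summarize_counts(counts):
    """print each (die1, die1 + die2) pair with the fraction of
    Gibbs samples and the fraction of direct samples that produced it"""
    gibbs_total = sum(g for g, d in counts.values())
    direct_total = sum(d for g, d in counts.values())
    for pair in sorted(counts):
        g, d = counts[pair]
        print pair, round(g / float(gibbs_total), 3), round(d / float(direct_total), 3)

summarize_counts(compare_distributions(10000))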
In [39]:
documents = [
["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
["R", "Python", "statistics", "regression", "probability"],
["machine learning", "regression", "decision trees", "libsvm"],
["Python", "R", "Java", "C++", "Haskell", "programming languages"],
["statistics", "probability", "mathematics", "theory"],
["machine learning", "scikit-learn", "Mahout", "neural networks"],
["neural networks", "deep learning", "Big Data", "artificial intelligence"],
["Hadoop", "Java", "MapReduce", "Big Data"],
["statistics", "R", "statsmodels"],
["C++", "deep learning", "artificial intelligence", "probability"],
["pandas", "R", "Python"],
["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
["libsvm", "regression", "support vector machines"]
]
In [46]:
# works by scanning forward through the cumulative distribution
def sample_from(weights):
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total * random.random()  # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                   # return the smallest i such that
        if rnd <= 0:               # weights[0] + ... + weights[i] >= rnd
            return i

def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document _d_
    that are assigned to _topic_ (plus some smoothing)"""
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to _topic_
    that equal _word_ (plus some smoothing)"""
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d, word, k):
    """given a document and a word in that document,
    return the weight for the kth topic"""
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])
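A quick sanity check on sample_from (an addition, not from the notebook): with weights [1, 1, 3], index 2 should come back about 60% of the time. topic_weight then combines p(word | topic) and p(topic | document), so choose_new_topic samples a topic in proportion to that product.

# tally 10,000 draws from sample_from with weights [1, 1, 3];
# expect roughly 2000 / 2000 / 6000 for indices 0 / 1 / 2
tallies = defaultdict(int)
for _ in range(10000):
    tallies[sample_from([1, 1, 3])] += 1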
In [40]:
from collections import Counter
In [181]:
K = 4  # the number of topics we'll try to find
# one counter per doc
document_topic_counts = [Counter() for _ in documents]
# one counter per topic
topic_word_counts = [Counter() for _ in range(K)]
# list of numbers, one per topic
topic_counts = [0 for _ in range(K)]
# list of numbers, one per doc
document_lengths = map(len, documents)
distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)
D = len(documents)
In [184]:
random.seed(0)
# start with random assignment
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

# initialize the counts from the random assignment above
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1
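A small consistency check (added here as a sketch): after the random initialization, every word carries exactly one topic assignment, so these totals should all agree.

# total words == total topic assignments == total per-document topic counts
total_words = sum(document_lengths)
assert sum(topic_counts) == total_words
assert sum(sum(counter.values()) for counter in document_topic_counts) == total_words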
In [185]:
for iteration in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):
            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1
In [186]:
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0:
            print k, word, count
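As a rough follow-up, sketched here using only variables already defined above, printing each document next to its topic counts shows which topics the sampler assigned where:

# for each document, print the document and its (topic, count) pairs
for document, topic_count in zip(documents, document_topic_counts):
    print document
    for topic, count in topic_count.most_common():
        if count > 0:
            print "  topic", topic, ":", count
    print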
In [ ]: