In [1]:
from __future__ import division
from collections import defaultdict
import random

In [2]:
# (word, job-posting popularity, resume popularity) triples
data = [ ("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),
        ("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),
        ("data science", 60, 70), ("analytics", 90, 3),
        ("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),
        ("actionable insights", 40, 30), ("think out of the box", 45, 10),
        ("self-starter", 30, 50), ("customer focus", 65, 15),
        ("thought leadership", 35, 35)]

In [3]:
# get and clean the data

def fix_unicode(text):
    """replace the Unicode right single quote with an ASCII apostrophe"""
    return text.replace(u"\u2019", "'")

In [4]:
from bs4 import BeautifulSoup
import requests
import re

url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

In [5]:
content = soup.find("div", "a-body")   # find the div containing the post body
regex = r"[\w']+|[\.]"                 # matches a word (with apostrophes) or a period

document = []

for paragraph in content('p'):
    words = re.findall(regex, fix_unicode(paragraph.text))
    document.extend(words)

In [6]:
bigrams = zip(document, document[1:])   # consecutive word pairs
transitions = defaultdict(list)
for prev, current in bigrams:
    transitions[prev].append(current)   # every word seen right after prev

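As a quick sanity check (the exact successors depend on the page fetched above), we can peek at the words recorded as following one common word:

In [ ]:
# every word that ever followed "data" in the scraped document
transitions["data"][:10]
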
In [7]:
def generate_using_bigrams():
    current = "."   # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]   # words that followed current
        current = random.choice(next_word_candidates) # pick one at random
        result.append(current)                        # add it to the result
        if current == ".":                            # a period ends the sentence
            return " ".join(result)

In [8]:
generate_using_bigrams()


Out[8]:
u"There's a problem that you'd otherwise have to the next sexy job data platform though not be data to Mike Driscoll dataspora statistics is a solution ."

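With only a single word of context, the output tends to veer off mid-thought, as above. Looking two words back constrains each choice much more, which is what the trigram version does next.
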
In [9]:
trigrams = zip(document, document[1:], document[2:])
trigram_transitions = defaultdict(list)
starts = []

for prev, current, next_word in trigrams:
    if prev == ".":              # if the previous "word" was a period,
        starts.append(current)   # this word starts a sentence

    trigram_transitions[(prev, current)].append(next_word)

In [20]:
def generate_using_trigrams():
    current = random.choice(starts)
    prev = "."
    result = [current]
    
    while True:
        next_word_candidates = trigram_transitions[(prev, current)]
        next_word = random.choice(next_word_candidates)
        
        prev, current = current, next_word
        result.append(current)
        
        if current == ".":
            return " ".join(result)

In [21]:
generate_using_trigrams()


Out[21]:
u"The part of the earlier data products that are easily described you can do something with it and that's where Moore's Law applied to data conditioning to drawing conclusions ."

Grammar


In [22]:
grammar = {
    "_S" : ["_NP _VP"],
    "_NP" : ["_N",
            "_A _NP _P _A _N"],
    "_VP" : ["_V",
            "_V _NP"],
    "_N" : ["data science", "Python", "regression"],
    "_A" : ["big", "linear", "logistic"],
    "_P" : ["about", "near"],
    "_V" : ["learns", "trains", "tests", "is"]
}

In [31]:
def is_terminal(token):
    """by convention, non-terminal names start with an underscore"""
    return token[0] != "_"

def expand(grammar, tokens):
    for i, token in enumerate(tokens):
        
        # skip over terminals
        if is_terminal(token):
            continue
            
        # if non-terminal choose replacement at random
        replacement = random.choice(grammar[token])
        
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
            
        return expand(grammar, tokens)
    
    return tokens

def generate_sentence(grammar):
    return expand(grammar, ["_S"])

In [34]:
generate_sentence(grammar)


Out[34]:
['linear',
 'linear',
 'Python',
 'near',
 'logistic',
 'regression',
 'about',
 'big',
 'regression',
 'trains']

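Since expand returns a list of tokens, as the output above shows, joining them with spaces turns the result into a readable sentence:

In [ ]:
" ".join(generate_sentence(grammar))
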
Gibbs Sampling


In [37]:
def roll_a_die():
    return random.choice([1,2,3,4,5,6])

def direct_sample():
    d1 = roll_a_die()
    d2 = roll_a_die()
    return d1, d1 + d2   # x is the first die, y is the sum of both

def random_y_given_x(x):
    """equally likely to be x + 1, x + 2, ..., x + 6"""
    return x + roll_a_die()

def random_x_given_y(y):
    if y <= 7:
        # if the total is 7 or less, the first die is equally likely
        # to be 1, 2, ..., (total - 1)
        return random.randrange(1, y)
    else:
        # if the total is more than 7, the first die is equally likely
        # to be (total - 6), (total - 5), ..., 6
        return random.randrange(y - 6, 7)
    
def gibbs_sample(num_iters = 100):
    x, y = 1, 2 # arbitrary
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y

def compare_distributions(num_samples = 1000):
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1    # slot 0 counts Gibbs samples
        counts[direct_sample()][1] += 1   # slot 1 counts direct samples
    return counts

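As a quick check on one of the conditional samplers (a sketch; exact counts vary run to run), conditioning on a total of y = 5 should make the first die roughly uniform over 1 through 4:

In [ ]:
from collections import Counter
Counter(random_x_given_y(5) for _ in range(10000))
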
In [38]:
compare_distributions()


Out[38]:
defaultdict(<function <lambda> at 0x105125ed8>, {(5, 9): [31, 38], (6, 9): [19, 19], (1, 3): [27, 34], (4, 8): [30, 31], (5, 6): [28, 19], (2, 8): [23, 30], (4, 7): [23, 29], (1, 6): [30, 30], (3, 7): [25, 35], (2, 5): [34, 26], (5, 8): [25, 20], (1, 2): [28, 29], (4, 9): [28, 28], (6, 10): [33, 27], (1, 5): [28, 37], (3, 6): [36, 23], (4, 5): [27, 23], (4, 10): [29, 30], (2, 6): [26, 26], (5, 11): [21, 21], (6, 11): [27, 26], (1, 4): [29, 35], (6, 7): [29, 30], (3, 9): [29, 29], (2, 3): [32, 28], (6, 8): [21, 21], (6, 12): [30, 28], (3, 5): [21, 25], (2, 7): [27, 24], (5, 10): [33, 20], (4, 6): [25, 27], (5, 7): [29, 32], (3, 8): [33, 39], (1, 7): [37, 26], (3, 4): [28, 33], (2, 4): [19, 22]})
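
Each of the 36 possible (x, y) pairs should show up about 1000 / 36, roughly 28, times per column; the Gibbs counts (first slot) agreeing with the direct-sample counts (second slot) up to sampling noise is what we expect once the chain has mixed.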

LDA


In [39]:
documents = [
        ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
        ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
        ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
        ["R", "Python", "statistics", "regression", "probability"],
        ["machine learning", "regression", "decision trees", "libsvm"],
        ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
        ["statistics", "probability", "mathematics", "theory"],
        ["machine learning", "scikit-learn", "Mahout", "neural networks"],
        ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
        ["Hadoop", "Java", "MapReduce", "Big Data"],
        ["statistics", "R", "statsmodels"],
        ["C++", "deep learning", "artificial intelligence", "probability"],
        ["pandas", "R", "Python"],
        ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
        ["libsvm", "regression", "support vector machines"]
]

In [46]:
# picks an index by subtracting weights from a uniform draw until it is used up
def sample_from(weights):
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total*random.random() # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w # find the smallest i
        if rnd <= 0: # s.t. weights[0] + ... + weights[i] >= rnd
            return i
        
def p_topic_given_document(topic, d, alpha = 0.1):
    """fraction of words in document _d_
    that are assigned to _topic_ (plus some smoothing)"""
    
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K*alpha))

def p_word_given_topic(word, topic, beta = 0.1):
    """fraction of words assigned to _topic_
    that equal _word_ (plus some smoothing)"""
    
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W*beta))

def topic_weight(d, word, k):
    """given a document and a word in that doc,
    return the weight for the kth topic"""
    
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])

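A quick illustration of sample_from (again, exact counts vary): with weights [1, 1, 3], index 2 should come back about 60% of the time, since 3 / 5 = 0.6.

In [ ]:
from collections import Counter
Counter(sample_from([1, 1, 3]) for _ in range(10000))
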
In [40]:
from collections import Counter

In [181]:
K = 4   # number of topics

# one counter per doc
document_topic_counts = [Counter() for _ in documents]

# one counter per topic
topic_word_counts = [Counter() for _ in range(K)]

# list of numbers, one per topic
topic_counts = [0 for _ in range(K)]

# list of numbers, one per doc (in Python 2, map returns a list we can mutate)
document_lengths = map(len, documents)

distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

D = len(documents)

In [184]:
random.seed(0)
# start with random assignment
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

# initialize the counts from the random assignment above
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

In [185]:
for iteration in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):
            
            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1
            
            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic
            
            # and add back to counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [186]:
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0:
            print k, word, count


0 neural networks 3
0 artificial intelligence 3
0 machine learning 3
0 Hadoop 2
0 Mahout 2
0 deep learning 2
0 mathematics 2
0 C++ 2
0 decision trees 2
0 Big Data 2
0 probability 1
0 regression 1
1 R 5
1 regression 4
1 libsvm 4
1 Big Data 4
1 NoSQL 2
1 Storm 2
1 deep learning 2
1 Spark 2
1 numpy 2
1 C++ 2
1 HBase 2
1 MongoDB 2
1 scikit-learn 2
1 Haskell 2
1 artificial intelligence 1
1 machine learning 1
1 support vector machines 1
2 Postgres 4
2 probability 3
2 Java 2
2 Hadoop 2
2 pandas 2
2 theory 2
2 MongoDB 2
2 MapReduce 2
2 MySQL 1
2 Python 1
2 R 1
2 databases 1
2 support vector machines 1
2 Cassandra 1
3 Python 7
3 statistics 6
3 Java 4
3 HBase 4
3 statsmodels 4
3 Cassandra 3
3 probability 2
3 R 2
3 programming languages 2
3 pandas 2
3 scipy 2
3 scikit-learn 2
3 neural networks 1
3 MySQL 1
3 databases 1

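One way to read these counts is to attach names to the four topics by eye (the labels below are hypothetical, chosen from the common words printed above, and are not part of the model) and then see which topics dominate each document:

In [ ]:
# hypothetical, hand-picked labels based on the common words printed above
topic_names = ["machine learning and AI",
               "Big Data and tools",
               "databases",
               "Python and statistics"]

for doc, doc_topic_counts in zip(documents, document_topic_counts):
    print doc
    for topic, count in doc_topic_counts.most_common():
        if count > 0:
            print "  ", topic_names[topic], count
    print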