Laplacian Smoothing


In [1]:
%pdb


Automatic pdb calling has been turned ON

In [2]:
from IPython.display import display
import pprint

import bow
import numpy as np
import string

In [3]:
def laplacian_smoothing(
    v1: int, v2: int, k: int, k_class: int
):
    """Laplace-smoothed probability estimate.

    Estimates P = (v1 + k) / (v2 + k * k_class), i.e. the event count v1
    over the total count v2, with pseudo-count k added for each of the
    k_class possible outcomes so unseen events get non-zero probability.
    """
    numerator = v1 + k
    denominator = v2 + k * k_class
    return numerator / denominator

In [4]:
def maximum_likelihood(
    v1: int, v2: int, **kwargs: list
):
    """Unsmoothed maximum-likelihood estimate: v1 / v2.

    Extra keyword arguments (k, k_class, ...) are accepted and ignored so
    this shares a call signature with `laplacian_smoothing`.
    """
    ratio = v1 / v2
    return ratio

In [5]:
def tokenize(text: str):
    """Split `text` on whitespace into uppercase tokens, stripping any
    leading/trailing punctuation from each token."""
    tokens = []
    for raw in text.upper().split():
        tokens.append(raw.strip(string.punctuation))
    return tokens

In [6]:
def p_query(
    query: list, 
    domain: bow.BagOfWords,
    bag: bow.BagOfWords,
    p_method: object
):
    """Per-word probability estimates for `query` within one class.

    For each word, v1 is the word's count in the class bag `domain`
    (0 when absent), v2 is the total word count of `domain`, and
    k_class is the vocabulary size of the combined bag `bag`.
    Returns a list of estimates, one per query word.
    """
    total_count = sum(domain.values())       # invariant across words
    vocab_size = len(bag)                    # distinct words over both classes
    estimates = []
    for word in query:
        count = domain[word] if word in domain else 0
        estimates.append(
            p_method(
                v1=count,
                v2=total_count,
                k=1,
                k_class=vocab_size
            )
        )
    return estimates

Test - Query Perfect Storm


In [7]:
# Toy corpora: one title per line. NOTE: the leading and trailing blank
# lines are significant — later code counts titles as newlines minus one.
s_movie = """
A PERFECT WORLD
MY PERFECT WOMAN
PRETTY WOMAN
"""

s_song = """
A PERFECT DAY
ELECTRIC STORM
ANOTHER RAIN DAY
"""

# Queries to classify; each query is a list of normalized (uppercase) tokens.
queries = [
    tokenize('Perfect Storm')
]

In [8]:
def test_perfect_storm(
    movie: str, song: str, p_method: object, queries: list
):
    """Classify each query as MOVIE vs SONG with a unigram Naive Bayes model.

    Args:
        movie: newline-delimited movie titles (string starts and ends with
            a newline, as in `s_movie`).
        song: newline-delimited song titles, same format.
        p_method: estimator called with keyword args v1, v2, k, k_class —
            e.g. `laplacian_smoothing` or `maximum_likelihood`.
        queries: list of tokenized queries (lists of uppercase words).

    Prints the per-class dictionaries, class priors, query likelihoods and
    the resulting posteriors.
    """
    # BUG FIX: the original body ignored the `movie`/`song` parameters and
    # read the globals s_movie/s_song directly; use the parameters, and use
    # distinct names so the argument strings are not shadowed.
    movie_bag = bow.BagOfWords(tokenize(movie))
    song_bag = bow.BagOfWords(tokenize(song))
    bag = movie_bag + song_bag

    print('\nDICTIONARIES')
    print('\nmovies:')
    pprint.pprint(str(movie_bag))
    print('\nsongs:')
    pprint.pprint(str(song_bag))
    print('\nall:')
    pprint.pprint(str(bag))

    # Class priors from the number of titles in each corpus. Each corpus
    # starts and ends with '\n', so entries = newline count - 1.
    print('\nPRIORS')  # was a copy-pasted 'DICTIONARIES' header
    n_movie = movie.count('\n') - 1
    n_song = song.count('\n') - 1
    print('movie entries:', n_movie, '| song entries:',  n_song)

    p_movie = p_method(
        v1=n_movie, 
        v2=n_movie+n_song, 
        k=1,
        k_class=2  # two classes: movie and song
    )

    p_song = p_method(
        v1=n_song, 
        v2=n_movie+n_song, 
        k=1,
        k_class=2  # two classes: movie and song
    )

    print('P(MOVIE): ', p_movie, '| P(SONG):', p_song)

    print('query:', queries)

    for query in queries:
        print('\nQuery: ', query)

        # Likelihood P(query|class) under the naive independence assumption:
        # product of per-word estimates.
        p_q_movie = np.prod(p_query(query, movie_bag, bag, p_method))
        p_q_song = np.prod(p_query(query, song_bag, bag, p_method))

        print('P(%s|MOVIE)' % ','.join(query), p_q_movie)
        print('P(%s|SONG)' % ','.join(query), p_q_song)

        # Bayes rule: posterior ∝ likelihood × prior, normalized over classes.
        _total = p_q_movie*p_movie + p_q_song*p_song
        p_movie_q = (p_q_movie*p_movie) / _total
        p_song_q = (p_q_song*p_song) / _total

        print('P(MOVIE|%s):' % ','.join(query), p_movie_q)
        print('P(SONG|%s):' % ','.join(query), p_song_q)

In [9]:
test_perfect_storm(s_movie, s_song, laplacian_smoothing, queries)


DICTIONARIES

movies:
"{'MY': 1, 'PRETTY': 1, 'A': 1, 'WORLD': 1, 'WOMAN': 2, 'PERFECT': 2}"

songs:
("{'RAIN': 1, 'A': 1, 'ELECTRIC': 1, 'STORM': 1, 'DAY': 2, 'ANOTHER': 1, "
 "'PERFECT': 1}")

all:
("{'WORLD': 1, 'WOMAN': 2, 'DAY': 2, 'ANOTHER': 1, 'A': 2, 'MY': 1, "
 "'ELECTRIC': 1, 'PRETTY': 1, 'RAIN': 1, 'PERFECT': 3, 'STORM': 1}")

DICTIONARIES
movie entries: 3 | song entries: 3
P(MOVIE):  0.5 | P(SONG): 0.5
query: [['PERFECT', 'STORM']]

Query:  ['PERFECT', 'STORM']
P(PERFECT,STORM|MOVIE) 0.00831024930748
P(PERFECT,STORM|SONG) 0.01108033241
P(MOVIE|PERFECT,STORM): 0.428571428571
P(SONG|PERFECT,STORM): 0.571428571429

In [10]:
test_perfect_storm(s_movie, s_song, maximum_likelihood, queries)


DICTIONARIES

movies:
"{'MY': 1, 'PRETTY': 1, 'A': 1, 'WORLD': 1, 'WOMAN': 2, 'PERFECT': 2}"

songs:
("{'RAIN': 1, 'A': 1, 'ELECTRIC': 1, 'STORM': 1, 'DAY': 2, 'ANOTHER': 1, "
 "'PERFECT': 1}")

all:
("{'WORLD': 1, 'WOMAN': 2, 'DAY': 2, 'ANOTHER': 1, 'A': 2, 'MY': 1, "
 "'ELECTRIC': 1, 'PRETTY': 1, 'RAIN': 1, 'PERFECT': 3, 'STORM': 1}")

DICTIONARIES
movie entries: 3 | song entries: 3
P(MOVIE):  0.5 | P(SONG): 0.5
query: [['PERFECT', 'STORM']]

Query:  ['PERFECT', 'STORM']
P(PERFECT,STORM|MOVIE) 0.0
P(PERFECT,STORM|SONG) 0.015625
P(MOVIE|PERFECT,STORM): 0.0
P(SONG|PERFECT,STORM): 1.0