In [1]:
import numpy as np
import pandas as pd
from itertools import product
from functools import reduce

In [2]:
variables = {
    'I': ['h', 'vh'],
    'G': ['A', 'B'],
    'GPA': ['[0,3)', '[3,3.5)', '[3.5,4.0]']
}

In [3]:
df = pd.DataFrame([
    dict(zip(variables.keys(), values))
    for values in product(*variables.values())
])
df['prob'] = np.random.randint(1, 10, len(df))
df['prob'] /= df['prob'].sum()
df


Out[3]:
G GPA I prob
0 A [0,3) h 0.035714
1 B [0,3) h 0.125000
2 A [3,3.5) h 0.071429
3 B [3,3.5) h 0.053571
4 A [3.5,4.0] h 0.071429
5 B [3.5,4.0] h 0.017857
6 A [0,3) vh 0.125000
7 B [0,3) vh 0.160714
8 A [3,3.5) vh 0.160714
9 B [3,3.5) vh 0.125000
10 A [3.5,4.0] vh 0.017857
11 B [3.5,4.0] vh 0.035714

In [4]:
def joint_probability(df, y, xs):
    df_y = df[df['G'] == y]
    p_y = df_y['prob'].sum()
        
    df_ = df_y
    l = []
    for col in xs:
        df_ = df_[df_[col] == xs[col]]
        p = df_['prob'].sum()
        l.append(p)
    
    return p_y * reduce(lambda x, y: x * y, l)

def joint_probability_with_cond_ind(df, y, xs):
    df_y = df[df['G'] == y]
    p_y = df_y['prob'].sum()
    
    l = [
        df_y[df_y[col] == xs[col]]['prob'].sum()
        for col in xs
    ]
    
    return p_y * reduce(lambda x, y: x * y, l)

def naive_bayes(df, y, xs):
    df_y = df[df['G'] == y]
    p_y = df_y['prob'].sum()
    
    l = [
        df_y[df_y[col] == xs[col]]['prob'].sum()
        for col in xs
    ]
    
    num = p_y * reduce(lambda x, y: x * y, l)
    den = sum([
        reduce(lambda x, y: x * y, [
            df[(df['G'] == y_val) & (df[col] == xs[col])]['prob'].sum()
            for col in xs
        ])
        for y_val in df['G'].unique()
    ])
    
    return num / den

Joint distribution

$$p(Y, X_1, ..., X_N)=p(Y)p(X_1|Y)\prod_{i=2}^{n}(X_i|X_1, ..., X_{i-1}, y)$$

In [5]:
joint_probability(df, 'A', {'GPA': '[3.5,4.0]', 'I': 'vh'})


Out[5]:
0.0026136570699708452

In [6]:
joint_probability(df, 'A', {'GPA': '[3.5,4.0]', 'I': 'h'})


Out[6]:
0.0061497813411078702

In [7]:
joint_probability(df, 'A', {'GPA': '[3,3.5)', 'I': 'h'})


Out[7]:
0.0061497813411078702

In [8]:
joint_probability(df, 'A', {'GPA': '[3,3.5)', 'I': 'vh'})


Out[8]:
0.023522913629737612

With conditional independence

$X_1, ..., X_n$ conditionally independent given $Y$

$$p(Y, X_1, ..., X_N)=p(Y)\prod_{i=1}^{n}(X_i|Y)$$

Need 2n + 1 parameters: 1 for Y~Ber(p) and 2 for each variable because there are two possible values of Y


In [9]:
joint_probability_with_cond_ind(df, 'A', {'GPA': '[3.5,4.0]', 'I': 'vh'})


Out[9]:
0.013068285349854226

In [10]:
joint_probability_with_cond_ind(df, 'A', {'GPA': '[3.5,4.0]', 'I': 'h'})


Out[10]:
0.007687226676384838

Naive Bayes


In [11]:
naive_bayes(df, 'A', {'GPA': '[3.5,4.0]', 'I': 'vh'})


Out[11]:
0.2948355601233299

In [12]:
naive_bayes(df, 'B', {'GPA': '[3.5,4.0]', 'I': 'vh'})


Out[12]:
0.20118191161356627

In [13]:
naive_bayes(df, 'A', {'GPA': '[3.5,4.0]', 'I': 'h'})


Out[13]:
0.29044750430292593

In [14]:
naive_bayes(df, 'B', {'GPA': '[3.5,4.0]', 'I': 'h'})


Out[14]:
0.205895008605852

In [15]:
naive_bayes(df, 'A', {'GPA': '[0,3)', 'I': 'h'})


Out[15]:
0.16313104189044036

In [16]:
naive_bayes(df, 'B', {'GPA': '[0,3)', 'I': 'h'})


Out[16]:
0.34264232008592915

In [ ]: