Conditional Probabilities

George Tzanetakis, University of Victoria

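In this notebook we explore conditional probabilities by sampling from a simple generative model and then estimating the conditional probabilities back from the generated samples.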

We first define a helper random variable class, built on top of SciPy's discrete random variable functionality (`scipy.stats.rv_discrete`), that supports both numeric and symbolic random variables.


In [2]:
%matplotlib inline 
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

In [3]:
class Random_Variable:
    """Discrete random variable with numeric (integer) or symbolic (string) outcomes.

    Thin wrapper around ``scipy.stats.rv_discrete``.

    Attributes:
        name: label passed through to ``rv_discrete``.
        values: the possible outcomes, exactly as supplied by the caller.
        probability_distribution: probability of each outcome, aligned with
            ``values``.
        type: ``'numeric'`` when every outcome is an integer, ``'symbolic'``
            when every outcome is a string, ``'undefined'`` otherwise.
        rv: the underlying ``rv_discrete`` object, or ``None`` when ``type``
            is ``'undefined'``.
        symbolic_values: only set for symbolic variables; same object as
            ``values``.
    """

    def __init__(self, name, values, probability_distribution):
        """Build the random variable and classify it as numeric or symbolic.

        Args:
            name: name of the random variable.
            values: sequence of outcomes (all integers or all strings).
            probability_distribution: sequence of probabilities, one per
                outcome, in the same order as ``values``.
        """
        self.name = name
        self.values = values
        self.probability_distribution = probability_distribution
        # Always define rv so the attribute exists even for 'undefined' variables.
        self.rv = None
        # isinstance (rather than an exact ``type(...) is np.int64`` match) so that
        # plain Python ints and every NumPy integer width are recognised; bool is
        # excluded explicitly because it is a subclass of int.
        if all(isinstance(item, (int, np.integer)) and not isinstance(item, bool)
               for item in values):
            self.type = 'numeric'
            self.rv = stats.rv_discrete(name = name, values = (values, probability_distribution))
        elif all(isinstance(item, str) for item in values):
            self.type = 'symbolic'
            # rv_discrete is given the positions 0..len(values)-1; sample() maps
            # the drawn positions back to the symbols.
            self.rv = stats.rv_discrete(name = name, values = (np.arange(len(values)), probability_distribution))
            self.symbolic_values = values
        else:
            self.type = 'undefined'

    def sample(self, size):
        """Draw ``size`` random outcomes.

        Returns:
            For a numeric variable, whatever ``rv_discrete.rvs(size=size)``
            returns. For a symbolic variable, a list of the drawn symbols.
            For an ``'undefined'`` variable, ``None``.
        """
        if self.type == 'numeric':
            return self.rv.rvs(size=size)
        if self.type == 'symbolic':
            numeric_samples = self.rv.rvs(size=size)
            # translate the drawn positions back into the corresponding symbols
            mapped_samples = [self.values[x] for x in numeric_samples]
            return mapped_samples
        # 'undefined' variables have no distribution to sample from
        return None

In [14]:
# how many (lyrics, genre) pairs to generate
num_samples = 100

# Prior over the genre of a song: P(country) = 0.7, P(jazz) = 0.3
values = ['country', 'jazz']
probs = [0.7, 0.3]
genre = Random_Variable('genre', values, probs)

# Conditional distribution of a song having lyrics given that it is jazz:
# P(no | jazz) = 0.9, P(yes | jazz) = 0.1
values = ['no', 'yes']
probs = [0.9, 0.1]
lyrics_if_jazz = Random_Variable('lyrics_if_jazz', values, probs)

# Conditional distribution of a song having lyrics given that it is country:
# P(no | country) = 0.2, P(yes | country) = 0.8
values = ['no', 'yes']
probs = [0.2, 0.8]
lyrics_if_country = Random_Variable('lyrics_if_country', values, probs)

# Generative process: first sample the genre from the prior, then use the
# outcome to pick which conditional distribution the lyrics are sampled from.
random_lyrics_samples = []
for n in range(num_samples):
    random_genre_sample = genre.sample(1)[0]
    # anything other than 'jazz' is treated as 'country'
    if random_genre_sample == 'jazz':
        conditional_rv, genre_label = lyrics_if_jazz, 'jazz'
    else:
        conditional_rv, genre_label = lyrics_if_country, 'country'
    random_lyrics_sample = (conditional_rv.sample(1)[0], genre_label)
    random_lyrics_samples.append(random_lyrics_sample)

random_lyrics_samples


Out[14]:
[('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('yes', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('no', 'country'),
 ('yes', 'country'),
 ('yes', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'country'),
 ('no', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('no', 'country'),
 ('no', 'country'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('yes', 'country'),
 ('no', 'jazz'),
 ('no', 'jazz'),
 ('yes', 'country')]

In [15]:
# Now estimate the conditional probabilities back from the generated samples.

# Start with jazz: keep only the pairs whose genre component (index 1) is 'jazz'
jazz_samples = [pair for pair in random_lyrics_samples if pair[1] == 'jazz']

def estimate_event_probability(f, samples):
    """Estimate the probability of an event as its relative frequency.

    The event is specified as a predicate over the possible outcomes.

    Args:
        f: predicate applied to each sample via ``filter``; samples for which
            it is truthy count as occurrences of the event.
        samples: sized collection of observed outcomes.

    Returns:
        Number of samples satisfying ``f`` divided by ``len(samples)``.

    Raises:
        ZeroDivisionError: if ``samples`` is empty.
    """
    occurrences = sum(1 for _ in filter(f, samples))
    return occurrences / len(samples)

# Empirical estimates of P(lyrics = no | jazz) and P(lyrics = yes | jazz):
# the fraction of the jazz samples whose lyrics component (index 0) is
# 'no' / 'yes'. Uses estimate_event_probability rather than repeating its
# counting logic inline.
est_no = estimate_event_probability(lambda x: x[0] == 'no', jazz_samples)
est_yes = estimate_event_probability(lambda x: x[0] == 'yes', jazz_samples)
print(est_no, est_yes)


0.9032258064516129 0.0967741935483871