notebook.community

Edit and run



In [3]:

    
%matplotlib inline
import random
import numpy as np
import matplotlib.pyplot as plt
import math
import matplotlib.mlab as mlab
import scipy.stats as ss

"""
CDF - Using Different Continuous Function to generate the distribution
"""



In [59]:

    
# Continuous Function 1 - Exponential Function: CDF(x) = 1-exp(-lambda*x)
## Complementary CDF (CCDF): CCDF(x) = exp(-lambda*x) 

lambda1 = 0.77
lambda2 = 0.99
x = np.linspace(7, 9, 100)  # generate 100 evenly spaced number between 7 & 9
y1 = np.exp(-lambda1*x)
y2 = np.exp(-lambda2*x)

plt.figure()
plt.plot(x, y1, c='green', label='lambda=0.77')
plt.plot(x, y2, c='purple', label='lambda=0.99')
plt.xlabel('$x$')
plt.ylabel('$\exp(-lambda*x)$')
plt.legend()
plt.show()



In [61]:

    
# If we take log of y in CCDF, we should be able to get linear line, because ln(y) = -lambda*x
## The plot here is to prove we can get linear lines
logy1 = [math.log(elem) for elem in y1]
logy2 = [math.log(elem) for elem in y2]

plt.figure()
plt.plot(x, logy1, c='green', label='lambda=0.77')
plt.plot(x, logy2, c='purple', label='lambda=0.99')
plt.xlabel('$x$')
plt.ylabel('$-lambda*x$')
plt.legend()
plt.show()



In [83]:

    
# Python random expovariate() generates numbers that are exponential distributed
random.seed(410)
cdf = [random.expovariate(7) for r in range(100)]  # lambda=7 here
sorted_data1 = np.sort(cdf)
plt.step(sorted_data1, np.arange(sorted_data1.size), c='purple', label='CDF')
plt.legend()
plt.show()



In [84]:

    
random.seed(410)
ccdf = [-random.expovariate(7)+1 for r in range(100)]  # lambda=7 here
sorted_data2 = np.sort(ccdf)
plt.step(sorted_data2, np.arange(sorted_data2.size), c='green', label='CCDF')
plt.legend()
plt.show()



In [85]:

    
# Pareto Distributon
# CDF(x) = 1- pow(x/xm, -alpha)
# CCDF(x) = pow(x/xm, -alpha)

## In python paretovariate, you can set alpha, xm=1 by default. You can use x/xm if your xm is not 1
random.seed(410)
cdf = [random.paretovariate(7) for r in range(100)]  # alpha=7 here
sorted_data1 = np.sort(cdf)
plt.step(sorted_data1, np.arange(sorted_data1.size), c='purple', label='CDF')
plt.legend()
plt.show()



In [86]:

    
random.seed(410)
ccdf = [-random.paretovariate(7)+1 for r in range(100)]  # alpha=7 here
sorted_data2 = np.sort(ccdf)
plt.step(sorted_data2, np.arange(sorted_data2.size), c='green', label='CCDF')
plt.legend()
plt.show()



In [101]:

    
# Normal Distribution
x = np.arange(-10, 11)
xU, xL = x + 0.5, x - 0.5 
## scale is standard deviation, loc is mean (0 by default)
prob = ss.norm.cdf(xU, scale = 7) - ss.norm.cdf(xL, scale = 7)  
prob = prob / prob.sum()  # normalize so that the sum is 1

sorted_data = np.sort(prob)
plt.step(np.arange(prob.size), prob, c='green', label='normal distribution')
plt.legend()
plt.show()



In [106]:

    
# CDF with a normal distribution
from scipy.special import erf

high = 9   # how many standard deviation above the mean
low = -high  # how many standard deviation below the mean
digits=2
n = (high - low) * 10**digits + 1
xs = np.linspace(low, high, n)
ps = (erf(xs / math.sqrt(2.0)) + 1) / 2

plt.step(xs, ps, c='green', label='normal distribution')
plt.legend()
plt.show()

# reference: http://greenteapress.com/thinkstats/erf.py



In [10]:

    
# This is Adult Intelligence scale: mu=100, sigma=15
## k means it the min, max exceeds mean k standard deviation. k=6 the highest IQ is 190

mu = 100
sigma = 15
k=6
x = np.linspace(mu - k*sigma, mu + k*sigma, 100)
plt.plot(x,mlab.normpdf(x, mu, sigma),c='green')
plt.show()



In [12]:

    
import pandas as pd

preg_raw = pd.read_csv("2002FemPregOut.csv")
preg_raw = preg_raw.dropna()
preg_raw.head()









    Out[12]:







  
    
      
      caseid
      nbrnaliv
      babysex
      birthwgt_lb
      birthwgt_oz
      prglngth
      outcome
      birthord
      agepreg
      finalwgt
    
  
  
    
      0
      1
      1.0
      1.0
      8.0
      13.0
      39
      1
      1.0
      3316.0
      6448.271112
    
    
      1
      1
      1.0
      2.0
      7.0
      14.0
      39
      1
      2.0
      3925.0
      6448.271112
    
    
      2
      2
      3.0
      1.0
      9.0
      2.0
      39
      1
      1.0
      1433.0
      12999.542264
    
    
      3
      2
      1.0
      2.0
      7.0
      0.0
      39
      1
      2.0
      1783.0
      12999.542264
    
    
      4
      2
      1.0
      2.0
      6.0
      3.0
      39
      1
      3.0
      1833.0
      12999.542264



In [32]:

    
# Looks like normal distribution
sorted_data = np.sort(preg_raw['prglngth'])
plt.step(sorted_data, np.arange(sorted_data.size)) 
plt.show()



In [25]:

    
# Now plot a normal distribution with the same mean and standard deviation
mu = preg_raw['prglngth'].mean()
sigma = preg_raw['prglngth'].std()
print mu, sigma
k=15  # mu-k*sigma=0 in this case
x = np.linspace(mu - k*sigma, mu + k*sigma, 100)
plt.plot(x,mlab.normpdf(x, mu, sigma),c='green')
plt.show()









    



38.5680642676 2.68779888561



In [39]:

    
# Compute rankits
# You have multiple samples of normal distribution, zip ith values together as ith list, get mean for each list,
# this mean is the rankit.
# ith rankit is the mean of ith values in multiple normal distribution samples
## reference: http://greenteapress.com/thinkstats/rankit.py
from statistics import mean

def get_sorted_normal_sample(sample_size):
    s = [random.normalvariate(0,1.0) for i in range(sample_size)]
    s.sort()
    return s

def get_sample_set(single_sample_size, sample_count):
    sample_set = [get_sorted_normal_sample(single_sample_size) for j in range(sample_count)]
    return sample_set

def get_rankits(single_sample_size, sample_count):
    t = get_sample_set(single_sample_size, sample_count)
    t = zip(*t)
    means = [mean(elem) for elem in t]
    return means

def get_sorted_lognormal_sample(sample_size):
    s = [math.log(abs(random.normalvariate(7,9))) for i in range(sample_size)]
    s.sort()
    return s

def get_lognormal_sample_set(single_sample_size, sample_count):
    sample_set = [get_sorted_lognormal_sample(single_sample_size) for j in range(sample_count)]
    return sample_set

def get_logrankits(single_sample_size, sample_count):
    t = get_lognormal_sample_set(single_sample_size, sample_count)
    t = zip(*t)
    means = [mean(elem) for elem in t]
    return means



In [26]:

    
# When the plot is closer to a straight line with x-axis and y-axis are in the same range & scale,
## it's closer to a normal distribution

# Normal probability plot - sorted data sample (x-axis), rankits from normal distribution samples
random.seed(410)
single_sample_size=7 
sample_count = 9
rankits = get_rankits(single_sample_size, sample_count)

xs = [random.normalvariate(0.0, 1.0) for i in range(sample_size)]
plt.clf()
plt.plot(sorted(xs), rankits, c='green')









    Out[26]:





[<matplotlib.lines.Line2D at 0x10fae59d0>]



In [40]:

    
# lognormal distribution
# log the values in normal distribution

random.seed(410)
single_sample_size=7 
sample_count = 9
rankits = get_logrankits(single_sample_size, sample_count)

xs = [math.log(abs(random.normalvariate(7, 9))) for i in range(sample_size)]
plt.clf()
plt.plot(sorted(xs), rankits, c='green')









    Out[40]:





[<matplotlib.lines.Line2D at 0x10fc4e390>]



In [ ]:

    
"""
To sum up what I did above:
1. Python methods to generate random values based on different CDF continuous function
2. Plot different CDF
3. Mehtods to tell whether a certain dataset satisfy a certain distribution

Knowing CDF continuous function formula, we can also generate random numbers based on a certain distribution,
but it's easier to use pyton built-in functions
"""

	caseid	nbrnaliv	babysex	birthwgt_lb	birthwgt_oz	prglngth	outcome	birthord	agepreg	finalwgt
0	1	1.0	1.0	8.0	13.0	39	1	1.0	3316.0	6448.271112
1	1	1.0	2.0	7.0	14.0	39	1	2.0	3925.0	6448.271112
2	2	3.0	1.0	9.0	2.0	39	1	1.0	1433.0	12999.542264
3	2	1.0	2.0	7.0	0.0	39	1	2.0	1783.0	12999.542264
4	2	1.0	2.0	6.0	3.0	39	1	3.0	1833.0	12999.542264