In [250]:

    
import random
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import math
%matplotlib inline

Gaussian distribution

Generation sample by rolling dices many times



In [225]:

    
def rollDice():
    return random.randint(1, 6)
    
def generate():
    N = 10000
    trials = {}
    for i in range(N):
        d1 = rollDice()
        d2 = rollDice()
        s = d1 + d2
        trials[s] = trials.get(s, 0) + 1
    return trials



In [226]:

    
data = generate()
plt.bar(list(data.keys()), list(data.values()))
plt.xticks(range(1, 12))
plt.show()

Cumulative Distribution



In [228]:

    
def calcCumulativeDist(data):
    s = {}
    for i, v in enumerate(data.values()):
        s[i] = s.get(i - 1, 0) + v
    return s

def takeSamples(rangeFrom, rangeTo, n):
    return [random.randint(rangeFrom, rangeTo) for i in range(n)]

cumulativDist = calcCumulativeDist(data)

plt.plot(list(cumulativDist.keys()), list(cumulativDist.values()))
plt.show()

Parameter Estimation

We try to find mean and variance using given data points.



In [212]:

    
data = generate()
data









    Out[212]:





{2: 275,
 3: 568,
 4: 828,
 5: 1112,
 6: 1399,
 7: 1666,
 8: 1396,
 9: 1138,
 10: 815,
 11: 550,
 12: 253}

1. Finding mean

Mean is simply the average of data points. N is the number of data points. $$ \mu = \frac{1}{N} \sum_{i}^{N} x_{i} \\ $$



In [213]:

    
N = np.sum(list(data.values()))



In [214]:

    
m = sum(k*v for k, v in data.items()) / N
m









    Out[214]:





6.9828000000000001

2. Finding variance

To find the variance we sum the squared differences from the mean. Finally we take the average of the sum. $$ \sigma^{2} = \frac{1}{N} \sum_{i}^{N}(x_{i} - \mu)^{2} \\ $$



In [215]:

    
variance = sum(np.power(k - m, 2) for k, v in data.items() for i in range(v)) / N
variance









    Out[215]:





5.7667041600002023

Gaussian (Normal) distribution

Say $ X \in \mathbb{R} $ if x is distirbuted Gaussian with mean $\mu$, variance $\sigma^{2}$. $$ X \sim \mathcal{N}(\mu,\,\sigma^{2}) $$

$$ p(x, \mu, \sigma^{2}) = \frac{1}{\sqrt{2\pi } \ \sigma} exp(-\frac{(x-\mu)^{2}}{2\sigma^{2}}) $$



In [216]:

    
def calc_p(x, m, variance):
    return 1/ np.sqrt(2*np.pi * np.sqrt(variance)) * np.exp(- np.power(x-m, 2) / 2*variance)



In [222]:

    
ps = dict((i, calc_p(i, m, variance)) for i in np.linspace(1, 12, 120))

plt.plot(list(ps.keys()), list(ps.values()))
plt.xticks(range(1, 13))
plt.show()



In [218]:

    
dict((i, calc_p(i, m, variance)) for i in range(1, 13))









    Out[218]:





{1: 3.8788803574085011e-46,
 2: 2.0897424126511052e-32,
 3: 3.523962956915412e-21,
 4: 1.8600392838703317e-12,
 5: 3.0730179930772096e-06,
 6: 0.015891331753233252,
 7: 0.25722175075408732,
 8: 0.013031886536201498,
 9: 2.066612704133072e-06,
 10: 1.0258009668963748e-12,
 11: 1.593746771096938e-21,
 12: 7.7504645873222998e-33}

Density Estimation & Cumulative Density Function

What is the probability of rolling two dice whose sum is between 2 and 8? Cumulative distribution function is as follows: $$ \phi(x, \mu, \sigma) = \frac{1}{2} \ (1 + erf( \frac{x-\mu}{\sigma \sqrt{2}} ) ) $$



In [233]:

    
def cumulativeCDF(x, m, stdv):
    return 0.5 * (1 + math.erf((x-m)/ (stdv * math.sqrt(2)) ))



In [242]:

    
print(cumulativeCDF(8, m, np.sqrt(variance)))









    



0.6640664428560954



In [249]:

    
plt.plot(
    np.linspace(1, 13, 130), 
    list(cumulativeCDF(i, m, np.sqrt(variance))  for i in np.linspace(1, 13, 130)))
plt.grid(True)
plt.show()



In [256]:

    
df = pd.read_csv("data/student/student-mat.csv", ";")
df.sample(5)









    Out[256]:






  
    
      
      school
      sex
      age
      address
      famsize
      Pstatus
      Medu
      Fedu
      Mjob
      Fjob
      ...
      famrel
      freetime
      goout
      Dalc
      Walc
      health
      absences
      G1
      G2
      G3
    
  
  
    
      294
      GP
      M
      18
      R
      LE3
      T
      3
      2
      services
      other
      ...
      5
      4
      2
      1
      1
      4
      8
      14
      13
      14
    
    
      159
      GP
      M
      16
      U
      GT3
      T
      3
      3
      other
      services
      ...
      4
      5
      5
      4
      4
      5
      4
      10
      12
      12
    
    
      368
      MS
      F
      18
      U
      GT3
      T
      2
      3
      at_home
      services
      ...
      5
      2
      3
      1
      2
      4
      0
      11
      10
      10
    
    
      195
      GP
      F
      17
      U
      LE3
      T
      2
      4
      services
      services
      ...
      4
      3
      2
      1
      1
      5
      0
      14
      15
      15
    
    
      283
      GP
      F
      18
      U
      GT3
      T
      1
      1
      other
      other
      ...
      5
      4
      4
      1
      1
      4
      4
      8
      9
      10
    
  

5 rows × 33 columns



In [268]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
school        395 non-null object
sex           395 non-null object
age           395 non-null int64
address       395 non-null object
famsize       395 non-null object
Pstatus       395 non-null object
Medu          395 non-null int64
Fedu          395 non-null int64
Mjob          395 non-null object
Fjob          395 non-null object
reason        395 non-null object
guardian      395 non-null object
traveltime    395 non-null int64
studytime     395 non-null int64
failures      395 non-null int64
schoolsup     395 non-null object
famsup        395 non-null object
paid          395 non-null object
activities    395 non-null object
nursery       395 non-null object
higher        395 non-null object
internet      395 non-null object
romantic      395 non-null object
famrel        395 non-null int64
freetime      395 non-null int64
goout         395 non-null int64
Dalc          395 non-null int64
Walc          395 non-null int64
health        395 non-null int64
absences      395 non-null int64
G1            395 non-null int64
G2            395 non-null int64
G3            395 non-null int64
dtypes: int64(16), object(17)
memory usage: 101.9+ KB



In [322]:

    
df[["G1", "G3", "G2"]].plot.hist(stacked=True, bins=20,alpha=0.5)









    Out[322]:





<matplotlib.axes._subplots.AxesSubplot at 0x10c4cce48>



In [313]:

    
def is_anomoly(g1, g2, g3, df):
    g1_d = df["G1"].describe()
    g2_d = df["G2"].describe()
    g3_d = df["G3"].describe()
    print(g1_d)
    print(g2_d)
    print(g3_d)
    p1 = calc_p(g1, g1_d["mean"], np.power(g1_d["std"], 2))
    p2 = calc_p(g2, g2_d["mean"], np.power(g2_d["std"], 2))
    p3 = calc_p(g3, g3_d["mean"], np.power(g3_d["std"], 2))
    print(p1)
    print(p2)
    print(p3)
    print(np.log(p1)+np.log(p2)+np.log(p3))
is_anomoly(10.908861, 10.713924, 10.415190, df)









    



count    395.000000
mean      10.908861
std        3.319195
min        3.000000
25%        8.000000
50%       11.000000
75%       13.000000
max       19.000000
Name: G1, dtype: float64
count    395.000000
mean      10.713924
std        3.761505
min        0.000000
25%        9.000000
50%       11.000000
75%       13.000000
max       19.000000
Name: G2, dtype: float64
count    395.000000
mean      10.415190
std        4.581443
min        0.000000
25%        8.000000
50%       11.000000
75%       14.000000
max       20.000000
Name: G3, dtype: float64
0.218974580748
0.20569761857
0.186384134117
-4.78009318289



In [334]:

    
from pandas.tools.plotting import scatter_matrix
scatter_matrix(df[
    ["G1", "G2", "G3", "age", "Walc", "goout", "absences", "studytime","famsize"]], alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()



In [ ]:

	school	sex	age	address	famsize	Pstatus	Medu	Fedu	Mjob	Fjob	...	famrel	freetime	goout	Dalc	Walc	health	absences	G1	G2	G3
294	GP	M	18	R	LE3	T	3	2	services	other	...	5	4	2	1	1	4	8	14	13	14
159	GP	M	16	U	GT3	T	3	3	other	services	...	4	5	5	4	4	5	4	10	12	12
368	MS	F	18	U	GT3	T	2	3	at_home	services	...	5	2	3	1	2	4	0	11	10	10
195	GP	F	17	U	LE3	T	2	4	services	services	...	4	3	2	1	1	5	0	14	15	15
283	GP	F	18	U	GT3	T	1	1	other	other	...	5	4	4	1	1	4	4	8	9	10