In [250]:
import random
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import math
%matplotlib inline
In [225]:
def rollDice():
    """Simulate a single throw of a six-sided die.

    Returns:
        int: the value drawn by ``random.randint(1, 6)`` (both ends inclusive).
    """
    lowest_face, highest_face = 1, 6
    return random.randint(lowest_face, highest_face)
def generate(n_trials=10000):
    """Simulate throwing two dice repeatedly and tally the sums.

    Args:
        n_trials: number of two-dice throws to simulate. Defaults to 10000,
            the value that used to be hard-coded in the body.

    Returns:
        dict: maps each observed sum to the number of times it occurred.
        Sums that never occurred are absent, and keys appear in the order in
        which they were first seen.
    """
    trials = {}
    for _ in range(n_trials):
        # Two independent throws per trial, drawn in the same order as before.
        s = rollDice() + rollDice()
        trials[s] = trials.get(s, 0) + 1
    return trials
In [226]:
# Run the simulation and draw a bar chart of how often each sum occurred.
data = generate()
plt.bar(list(data.keys()), list(data.values()))
# Two dice can only sum to 2..12, so tick exactly those values; the previous
# range(1, 12) labelled the impossible sum 1 and left 12 without a tick.
plt.xticks(range(2, 13))
plt.show()
In [228]:
def calcCumulativeDist(data):
    """Build the cumulative frequency table of an ``{outcome: count}`` mapping.

    Outcomes are visited in ascending order and each outcome maps to the total
    count of all outcomes less than or equal to it.

    Args:
        data: mapping from an orderable outcome to its count.

    Returns:
        dict: ``{outcome: running total}`` in ascending outcome order; an
        empty dict for empty input.
    """
    cumulative = {}
    running_total = 0
    # Sort first: a dict iterates in insertion order, which says nothing about
    # the numeric order of the outcomes, and accumulating in that order does
    # not yield a cumulative distribution.
    for outcome in sorted(data):
        running_total += data[outcome]
        # Key by the outcome itself (not a positional index) so the result can
        # be plotted directly against the outcome axis.
        cumulative[outcome] = running_total
    return cumulative
def takeSamples(rangeFrom, rangeTo, n):
    """Draw ``n`` random integers between ``rangeFrom`` and ``rangeTo``.

    Every value comes from ``random.randint(rangeFrom, rangeTo)``, so both
    bounds are inclusive.

    Returns:
        list: the drawn integers, in the order they were drawn.
    """
    samples = []
    for _ in range(n):
        samples.append(random.randint(rangeFrom, rangeTo))
    return samples
# Accumulate the simulated counts and plot the running totals as a line.
cumulativDist = calcCumulativeDist(data)
plt.plot(list(cumulativDist), list(cumulativDist.values()))
plt.show()
In [212]:
# Run a fresh simulation (this rebinds `data`) and display the resulting tally.
data = generate()
data
Out[212]:
In [213]:
# Sum of all counts stored in `data`.
N = np.asarray(list(data.values())).sum()
In [214]:
# Count-weighted mean of the outcomes in `data`.
m = sum(outcome * count for outcome, count in data.items()) / N
m
Out[214]:
In [215]:
# Population variance of the outcomes around `m`. Each squared deviation is
# weighted by its count rather than being re-added once per occurrence, which
# avoids a Python-level loop over every simulated throw.
variance = sum(v * np.power(k - m, 2) for k, v in data.items()) / N
variance
Out[215]:
In [216]:
def calc_p(x, m, variance):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Computes ``exp(-(x - m)**2 / (2 * variance)) / sqrt(2 * pi * variance)``.

    Args:
        x: point(s) at which to evaluate the density; a scalar or NumPy array.
        m: mean of the distribution.
        variance: variance (sigma squared) of the distribution.

    Returns:
        The density value(s), as produced by the NumPy ufuncs used below.
    """
    # The normalising constant is sqrt(2*pi*sigma^2): it takes the variance
    # itself, not its square root.
    normaliser = np.sqrt(2 * np.pi * variance)
    # Parenthesise the denominator: `/ 2*variance` would divide by 2 and then
    # multiply by the variance.
    exponent = -np.power(x - m, 2) / (2 * variance)
    return np.exp(exponent) / normaliser
In [222]:
# Evaluate calc_p for the computed mean and variance on a 120-point grid over
# 1..12 and plot the resulting curve.
ps = {i: calc_p(i, m, variance) for i in np.linspace(1, 12, 120)}
plt.plot(list(ps), list(ps.values()))
plt.xticks(range(1, 13))
plt.show()
In [218]:
# Tabulate calc_p at the integer points 1..12.
{i: calc_p(i, m, variance) for i in range(1, 13)}
Out[218]:
In [233]:
def cumulativeCDF(x, m, stdv):
    """Cumulative distribution function of a normal distribution at ``x``.

    Uses the error-function form ``0.5 * (1 + erf((x - m) / (stdv * sqrt(2))))``.

    Args:
        x: scalar point at which to evaluate the CDF.
        m: mean of the distribution.
        stdv: standard deviation of the distribution.

    Returns:
        float: the CDF value at ``x``.
    """
    scaled_deviation = (x - m) / (stdv * math.sqrt(2))
    return 0.5 * (1 + math.erf(scaled_deviation))
In [242]:
# CDF at x = 8 for the computed mean and standard deviation.
print(cumulativeCDF(x=8, m=m, stdv=np.sqrt(variance)))
In [249]:
# Plot cumulativeCDF for the computed mean/std on a 130-point grid over 1..13.
plt.plot(
    np.linspace(1, 13, 130),
    [cumulativeCDF(i, m, np.sqrt(variance)) for i in np.linspace(1, 13, 130)],
)
plt.grid(True)
plt.show()
In [256]:
# The file is semicolon-delimited. Pass the delimiter by keyword: current
# pandas releases accept only the path positionally in read_csv.
df = pd.read_csv("data/student/student-mat.csv", sep=";")
df.sample(5)
Out[256]:
In [268]:
# Print pandas' concise summary of the loaded frame.
df.info()
In [322]:
# Stacked, semi-transparent histogram of the three grade columns.
df[["G1", "G3", "G2"]].plot.hist(
    stacked=True,
    bins=20,
    alpha=0.5,
)
Out[322]:
In [313]:
def is_anomoly(g1, g2, g3, df):
    """Print per-grade density values and the sum of their logs.

    For the columns "G1", "G2" and "G3" of ``df`` this prints each column's
    ``describe()`` summary, then the value of ``calc_p`` for the matching
    grade (using the column's mean and squared std), and finally the sum of
    the natural logs of those three values.

    Args:
        g1: grade to evaluate against column "G1".
        g2: grade to evaluate against column "G2".
        g3: grade to evaluate against column "G3".
        df: frame providing the "G1", "G2" and "G3" columns.

    Returns:
        None. Everything is written with ``print``.
    """
    summaries = [df[column].describe() for column in ("G1", "G2", "G3")]
    for summary in summaries:
        print(summary)

    # describe() reports the standard deviation; calc_p takes a variance.
    densities = [
        calc_p(grade, summary["mean"], np.power(summary["std"], 2))
        for grade, summary in zip((g1, g2, g3), summaries)
    ]
    for density in densities:
        print(density)

    p1, p2, p3 = densities
    print(np.log(p1) + np.log(p2) + np.log(p3))
# NOTE(review): these three inputs look like the G1/G2/G3 column means — confirm.
is_anomoly(g1=10.908861, g2=10.713924, g3=10.415190, df=df)
In [334]:
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the selected columns, with a KDE on the diagonal.
# NOTE(review): "famsize" may be a non-numeric column, in which case it will
# not appear in the grid — confirm against the dataset.
scatter_matrix(
    df[["G1", "G2", "G3", "age", "Walc", "goout", "absences", "studytime", "famsize"]],
    alpha=0.2,
    figsize=(12, 12),
    diagonal='kde',
)
plt.show()
In [ ]: