In [1]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
%matplotlib inline
%pylab inline
In [4]:
import numpy as np
In [6]:
import statsmodels.api as sm
In [7]:
import statsmodels
In [10]:
statsmodels.__version__
Out[10]:
In [11]:
import statsmodels.formula.api as smf
In [12]:
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
In [13]:
dat
Out[13]:
In [14]:
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
In [15]:
results.summary()
Out[15]:
In [16]:
import pandas
In [25]:
data = pandas.read_csv('brain_size.csv', sep=';', na_values=".")
In [26]:
data
Out[26]:
In [19]:
import numpy as np
t = np.linspace(-6, 6, 20)
sin_t = np.sin(t)
cos_t = np.cos(t)
In [20]:
pandas.DataFrame({'t': t, 'sin': sin_t, 'cos': cos_t})
Out[20]:
In [21]:
import statsmodels.formula.api as smf
In [23]:
model = smf.ols("VIQ ~ Gender + 1", data).fit()
In [24]:
model.summary()
Out[24]:
Do the observations come from a particular distribution?
In [35]:
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.random.normal.html
import numpy as np
In [36]:
np.random.normal?
In [39]:
mu, sigma = 0, 0.1 # mean and standard deviation
s = np.random.normal(mu, sigma, 1000)
In [40]:
np.mean(s)
Out[40]:
In [41]:
np.std(s, ddof=1)
Out[41]:
In [42]:
import matplotlib.pyplot as plt
In [43]:
count, bins, ignored = plt.hist(s, 30, normed=True)
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *
np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
linewidth=2, color='r')
Out[43]:
In [44]:
from scipy.stats import norm
In [91]:
norm?
# norm.pdf?
In [84]:
loc = 25
scale = 4
norm.pdf(10, loc, scale)
Out[84]:
In [85]:
x = np.linspace(norm.ppf(0.01, loc, scale), norm.ppf(0.99, loc, scale), 100)
In [86]:
norm.ppf(0.99, loc, scale)
Out[86]:
In [87]:
norm.ppf(0.01, loc, scale)
Out[87]:
In [93]:
plt.plot(x, norm.pdf(x, loc, scale), 'r-', lw=5, alpha=0.6)
plt.ylabel('PDF')
plt.xlabel('Age')
Out[93]:
In [98]:
ages = np.linspace(17, 90, 100)
In [102]:
loc
Out[102]:
In [181]:
threshold = 0.01
age_threshold = 0.01
def is_young(age):
return norm.pdf(age, loc=20, scale=3)
def good_age(age):
return norm.pdf(age, loc=45, scale=5)
def is_old(age):
return norm.pdf(age, loc=90, scale=8)
plt.plot(ages, is_young(ages), 'r', lw=2, alpha=0.5)
plt.plot(ages, good_age(ages), 'g', lw=2, alpha=0.5)
plt.plot(ages, is_old(ages), 'r', lw=2, alpha=0.5)
plt.plot(ages, np.full(100, threshold), 'gray', lw=2, alpha=0.5)
plt.ylabel('PDF')
plt.xlabel('Age')
Out[181]:
In [128]:
norm.pdf(90, loc=90, scale=6)
Out[128]:
In [148]:
def predict_on_age(age):
if is_young(age) > threshold or is_old(age) > threshold:
return 0
if good_age(age) > threshold :
return 1
return 2
In [160]:
predict_on_age(32)
Out[160]:
In [205]:
kms = np.linspace(0, 100, 100)
kms_threshold = 0.005
def no_practice(km):
return norm.pdf(km, loc=1, scale=3)
def much_driving(km):
return norm.pdf(km, loc=100, scale=20)
def sweet_spot(km):
return norm.pdf(km, loc=20, scale=5)
plt.plot(kms, no_practice(kms), 'r', lw=2, alpha=0.5)
plt.plot(kms, much_driving(kms), 'r', lw=2, alpha=0.5)
plt.plot(kms, sweet_spot(kms), 'g', lw=2, alpha=0.5)
plt.plot(kms, np.full(100, kms_threshold), 'gray', lw=2, alpha=0.5)
plt.ylabel('PDF')
plt.xlabel('thousand km per year')
Out[205]:
In [176]:
kmhs = np.linspace(90, 250, 100)
kmhs_threshold = 0.002
def too_fast(kmh):
return norm.pdf(kmh, loc=250, scale=30)
plt.plot(kmhs, too_fast(kmhs), 'r', lw=2, alpha=0.5)
plt.plot(kmhs, np.full(100, kmhs_threshold), 'gray', lw=2, alpha=0.5)
plt.ylabel('PDF')
plt.xlabel('km/h')
Out[176]:
In [187]:
def predict(x):
speed, age, km_per_year = x
if (is_young(age) > age_threshold or is_old(age) > age_threshold
or too_fast(speed) > kmhs_threshold
or no_practice(km_per_year) > kms_threshold or much_driving(km_per_year) > kms_threshold):
return 0
if good_age(age) > age_threshold or sweet_spot(km_per_year) > kms_threshold:
return 1
return 2
In [197]:
predict([190, 47, 10])
Out[197]:
In [198]:
age_factor = 1
kmhs_factor = 1
kms_factor = 1
def scoring(x):
speed, age, km_per_year = x
pos_score = good_age(age) * age_factor + sweet_spot(km_per_year) * kms_factor
neg_score = (is_young(age) * age_factor + is_old(age) * age_factor
+ too_fast(speed) * kmhs_factor
+ no_practice(km_per_year) * kms_factor + much_driving(km_per_year) * kms_factor)
return pos_score - neg_score
In [200]:
scoring([190, 47, 10])
Out[200]:
In [203]:
scoring([250, 87, 100])
Out[203]:
In [204]:
# as good as it gets
scoring([100, 47, 20])
Out[204]:
In [206]:
# nightmare customer
scoring([250, 90, 1])
Out[206]:
In [207]:
# nightmare customer
scoring([180, 35, 25])
Out[207]:
In [209]:
score_threshold = 0.005
def predict_for_score(x):
score = scoring(x)
if abs(score) < score_threshold:
return 2
if score < 0:
return 1
return 0
In [210]:
predict_for_score([190, 47, 10])
Out[210]:
In [211]:
predict_for_score([250, 87, 100])
Out[211]:
In [ ]:
scoring([100, 47, 20])