In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
%matplotlib inline
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [4]:
import numpy as np

In [6]:
import statsmodels.api as sm

In [7]:
import statsmodels

In [10]:
statsmodels.__version__


Out[10]:
'0.8.0'

In [11]:
import statsmodels.formula.api as smf

In [12]:
dat = sm.datasets.get_rdataset("Guerry", "HistData").data

In [13]:
dat


Out[13]:
dept Region Department Crime_pers Crime_prop Literacy Donations Infants Suicides MainCity ... Crime_parents Infanticide Donation_clergy Lottery Desertion Instruction Prostitutes Distance Area Pop1831
0 1 E Ain 28870 15890 37 5098 33120 35039 2:Med ... 71 60 69 41 55 46 13 218.372 5762 346.03
1 2 N Aisne 26226 5521 51 8901 14572 12831 2:Med ... 4 82 36 38 82 24 327 65.945 7369 513.00
2 3 C Allier 26747 7925 13 10973 17044 114121 2:Med ... 46 42 76 66 16 85 34 161.927 7340 298.26
3 4 E Basses-Alpes 12935 7289 46 2733 23018 14238 1:Sm ... 70 12 37 80 32 29 2 351.399 6925 155.90
4 5 E Hautes-Alpes 17488 8174 69 6962 23076 16171 1:Sm ... 22 23 64 79 35 7 1 320.280 5549 129.10
5 7 S Ardeche 9474 10263 27 3188 42117 52547 1:Sm ... 76 47 67 70 19 62 1 279.413 5529 340.73
6 8 N Ardennes 35203 8847 67 6400 16106 26198 2:Med ... 53 85 49 31 62 9 83 105.694 5229 289.62
7 9 S Ariege 6173 9597 18 3542 22916 123625 1:Sm ... 74 28 63 75 22 77 3 385.313 4890 253.12
8 10 E Aube 19602 4086 59 3608 18642 10989 2:Med ... 77 54 9 28 86 15 207 83.244 6004 246.36
9 11 S Aude 15647 10431 34 2582 20225 66498 2:Med ... 80 35 27 50 63 48 1 370.949 6139 270.13
10 12 S Aveyron 8236 6731 31 3211 21981 116671 2:Med ... 51 5 23 81 10 44 4 296.089 8735 359.06
11 13 S Bouches-du-Rhone 13409 5291 38 2314 9325 8107 3:Lg ... 45 74 55 3 23 43 25 362.568 5087 359.47
12 14 N Calvados 17577 4500 52 27830 8983 31807 2:Med ... 57 56 11 13 12 22 194 117.487 5548 494.70
13 15 C Cantal 18070 11645 31 4093 15335 87338 2:Med ... 79 83 66 82 1 51 20 245.849 5726 258.59
14 16 W Charente 24964 13018 36 13602 19454 25720 2:Med ... 2 7 81 60 61 47 8 224.339 5956 362.53
15 17 W Charente-Inferieure 18712 5357 39 13254 23999 16798 2:Med ... 3 38 72 35 74 42 27 238.538 6864 445.25
16 18 C Cher 21934 10503 13 9561 23574 19497 2:Med ... 69 11 86 44 51 83 26 116.257 7235 256.06
17 19 C Correze 15262 12949 12 14993 19330 47480 2:Med ... 86 16 82 84 2 86 3 227.899 5857 294.83
18 21 E Cote-d'Or 32256 9159 60 2540 15599 16128 2:Med ... 49 27 18 33 78 13 206 136.109 8763 375.88
19 22 W Cotes-du-Nord 28607 7050 16 10387 36098 75056 2:Med ... 6 69 15 72 47 80 16 225.161 6878 598.87
20 23 C Creuse 37014 20235 23 10997 14363 77823 1:Sm ... 75 24 75 85 4 71 12 180.846 5565 265.38
21 24 W Dordogne 21585 10237 18 4687 21375 36024 2:Med ... 64 18 79 77 44 78 3 253.776 9060 482.75
22 25 E Doubs 11560 5914 73 3436 12512 40690 2:Med ... 38 25 6 18 73 2 65 202.065 5234 265.54
23 26 E Drome 13396 7759 42 2829 16348 23816 2:Med ... 21 13 62 54 46 38 8 295.543 6530 299.56
24 27 N Eure 14795 4774 51 11712 16039 13493 2:Med ... 39 45 45 47 27 23 179 61.863 6040 424.25
25 28 C Eure-et-Loir 21368 4016 54 4553 14475 15015 2:Med ... 18 62 14 48 72 18 180 54.558 5880 278.82
26 29 W Finistere 29872 6842 15 23945 28392 25143 2:Med ... 24 78 25 36 77 81 42 276.210 6733 524.40
27 30 S Gard 13115 7990 40 3048 28726 18292 2:Med ... 15 39 59 20 40 40 5 323.004 5853 357.38
28 31 S Haute-Garonne 18642 7204 31 2286 15378 56140 3:Lg ... 62 59 13 25 15 33 8 361.668 6257 427.86
29 32 S Gers 18642 10486 38 2848 15250 61510 2:Med ... 43 13 32 74 30 44 1 343.569 6309 312.16
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
56 59 N Nord 26740 6175 45 6092 8926 13851 3:Lg ... 14 81 38 7 64 30 308 106.335 5743 989.94
57 60 N Oise 28180 6659 54 5501 18021 5994 2:Med ... 31 86 50 43 57 20 337 33.768 5860 397.73
58 61 N Orne 28329 8248 45 9242 20852 34069 2:Med ... 29 50 31 57 25 33 117 97.554 6103 441.88
59 62 N Pas-de-Calais 23101 4040 49 5740 10575 15400 2:Med ... 36 79 10 27 48 26 163 104.400 6671 655.22
60 63 C Puy-de-Dome 17256 12141 19 5963 22948 78148 2:Med ... 42 63 61 53 8 76 62 205.218 7970 573.11
61 64 W Basses-Pyrenees 16722 8533 47 3299 12393 65995 2:Med ... 34 72 60 34 7 28 12 387.935 7645 428.40
62 65 S Hautes-Pyrenees 12223 9797 53 6001 12125 148039 2:Med ... 85 75 71 76 20 21 5 386.559 4464 233.03
63 66 S Pyrenees-Orientales 6728 7632 31 11644 15167 37843 2:Med ... 67 84 77 11 18 52 5 403.445 4116 157.05
64 67 E Bas-Rhin 12309 4920 62 14472 14356 18623 3:Lg ... 23 48 51 5 53 12 101 217.752 4755 540.21
65 68 E Haut-Rhin 7343 4915 71 6001 14783 21233 2:Med ... 40 53 17 10 56 5 26 217.971 3525 424.26
66 69 E Rhone 18793 4504 45 1983 3910 17003 3:Lg ... 37 33 21 2 14 31 104 213.032 3249 434.43
67 70 E Haute-Saone 22339 7770 59 11701 11850 39714 1:Sm ... 25 68 57 65 83 14 99 176.135 5360 338.91
68 71 E Saone-et-Loire 28391 10708 32 3710 20442 22184 2:Med ... 11 10 58 45 31 49 40 168.713 8575 523.97
69 72 C Sarthe 33913 8294 30 3357 10779 29280 2:Med ... 41 57 19 49 75 35 79 108.294 6206 457.37
70 75 N Seine 13945 1368 71 4204 2660 3632 3:Lg ... 60 67 53 1 33 6 4744 0.000 762 935.11
71 76 N Seine-Inferieure 18355 2906 43 7245 7506 9523 3:Lg ... 28 61 74 9 36 35 546 75.658 6278 693.68
72 77 N Seine-et-Marne 22201 5786 54 5303 16324 7315 2:Med ... 16 73 26 29 67 37 453 27.647 5915 323.89
73 78 N Seine-et-Oise 12477 3879 56 4007 16303 3460 2:Med ... 10 30 24 6 42 17 874 16.888 5334 448.18
74 79 W Deux-Sevres 18400 6863 41 16956 25461 24533 2:Med ... 30 4 85 71 84 39 6 188.474 5999 297.85
75 80 N Somme 33592 7144 44 4964 12447 12836 2:Med ... 7 64 33 30 80 34 302 69.520 6170 543.70
76 81 S Tarn 13019 6241 20 3449 29305 68980 2:Med ... 13 9 47 67 17 73 3 328.146 5758 333.84
77 82 S Tarn-et-Garonne 14790 8680 25 4558 23771 48317 2:Med ... 66 41 52 64 39 64 4 313.090 3718 242.51
78 83 S Var 13145 9572 23 2449 14800 13380 2:Med ... 55 49 40 26 52 69 6 389.512 5973 317.50
79 84 S Vaucluse 13576 5731 37 1246 17239 19024 2:Med ... 61 76 54 8 41 45 2 337.215 3567 239.11
80 85 W Vendee 20827 7566 28 14035 62486 67963 1:Sm ... 50 44 30 68 79 59 4 212.459 6720 330.36
81 86 W Vienne 15010 4710 25 8922 35224 21851 2:Med ... 20 1 44 40 38 65 18 170.523 6990 282.73
82 87 C Haute-Vienne 16256 6402 13 13817 19940 33497 2:Med ... 68 6 78 55 11 84 7 198.874 5520 285.13
83 88 E Vosges 18835 9044 62 4040 14978 33029 2:Med ... 58 34 5 14 85 11 43 174.477 5874 397.99
84 89 C Yonne 18006 6516 47 4276 16616 12789 2:Med ... 32 22 35 51 66 27 272 81.797 7427 352.49
85 200 NaN Corse 2199 4589 49 37015 24743 37016 2:Med ... 81 2 84 83 9 25 1 539.213 8680 195.41

86 rows × 23 columns


In [14]:
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()

In [15]:
results.summary()


Out[15]:
OLS Regression Results
Dep. Variable: Lottery R-squared: 0.348
Model: OLS Adj. R-squared: 0.333
Method: Least Squares F-statistic: 22.20
Date: Thu, 28 Dec 2017 Prob (F-statistic): 1.90e-08
Time: 11:32:47 Log-Likelihood: -379.82
No. Observations: 86 AIC: 765.6
Df Residuals: 83 BIC: 773.0
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 246.4341 35.233 6.995 0.000 176.358 316.510
Literacy -0.4889 0.128 -3.832 0.000 -0.743 -0.235
np.log(Pop1831) -31.3114 5.977 -5.239 0.000 -43.199 -19.424
Omnibus: 3.713 Durbin-Watson: 2.019
Prob(Omnibus): 0.156 Jarque-Bera (JB): 3.394
Skew: -0.487 Prob(JB): 0.183
Kurtosis: 3.003 Cond. No. 702.

In [16]:
import pandas

In [25]:
data = pandas.read_csv('brain_size.csv', sep=';', na_values=".")

In [26]:
data


Out[26]:
Unnamed: 0 Gender FSIQ VIQ PIQ Weight Height MRI_Count
0 1 Female 133 132 124 118.0 64.5 816932
1 2 Male 140 150 124 NaN 72.5 1001121
2 3 Male 139 123 150 143.0 73.3 1038437
3 4 Male 133 129 128 172.0 68.8 965353
4 5 Female 137 132 134 147.0 65.0 951545
5 6 Female 99 90 110 146.0 69.0 928799
6 7 Female 138 136 131 138.0 64.5 991305
7 8 Female 92 90 98 175.0 66.0 854258
8 9 Male 89 93 84 134.0 66.3 904858
9 10 Male 133 114 147 172.0 68.8 955466
10 11 Female 132 129 124 118.0 64.5 833868
11 12 Male 141 150 128 151.0 70.0 1079549
12 13 Male 135 129 124 155.0 69.0 924059
13 14 Female 140 120 147 155.0 70.5 856472
14 15 Female 96 100 90 146.0 66.0 878897
15 16 Female 83 71 96 135.0 68.0 865363
16 17 Female 132 132 120 127.0 68.5 852244
17 18 Male 100 96 102 178.0 73.5 945088
18 19 Female 101 112 84 136.0 66.3 808020
19 20 Male 80 77 86 180.0 70.0 889083
20 21 Male 83 83 86 NaN NaN 892420
21 22 Male 97 107 84 186.0 76.5 905940
22 23 Female 135 129 134 122.0 62.0 790619
23 24 Male 139 145 128 132.0 68.0 955003
24 25 Female 91 86 102 114.0 63.0 831772
25 26 Male 141 145 131 171.0 72.0 935494
26 27 Female 85 90 84 140.0 68.0 798612
27 28 Male 103 96 110 187.0 77.0 1062462
28 29 Female 77 83 72 106.0 63.0 793549
29 30 Female 130 126 124 159.0 66.5 866662
30 31 Female 133 126 132 127.0 62.5 857782
31 32 Male 144 145 137 191.0 67.0 949589
32 33 Male 103 96 110 192.0 75.5 997925
33 34 Male 90 96 86 181.0 69.0 879987
34 35 Female 83 90 81 143.0 66.5 834344
35 36 Female 133 129 128 153.0 66.5 948066
36 37 Male 140 150 124 144.0 70.5 949395
37 38 Female 88 86 94 139.0 64.5 893983
38 39 Male 81 90 74 148.0 74.0 930016
39 40 Male 89 91 89 179.0 75.5 935863

In [19]:
import numpy as np
t = np.linspace(-6, 6, 20)
sin_t = np.sin(t)
cos_t = np.cos(t)

In [20]:
pandas.DataFrame({'t': t, 'sin': sin_t, 'cos': cos_t})


Out[20]:
cos sin t
0 0.960170 0.279415 -6.000000
1 0.609977 0.792419 -5.368421
2 0.024451 0.999701 -4.736842
3 -0.570509 0.821291 -4.105263
4 -0.945363 0.326021 -3.473684
5 -0.955488 -0.295030 -2.842105
6 -0.596979 -0.802257 -2.210526
7 -0.008151 -0.999967 -1.578947
8 0.583822 -0.811882 -0.947368
9 0.950551 -0.310567 -0.315789
10 0.950551 0.310567 0.315789
11 0.583822 0.811882 0.947368
12 -0.008151 0.999967 1.578947
13 -0.596979 0.802257 2.210526
14 -0.955488 0.295030 2.842105
15 -0.945363 -0.326021 3.473684
16 -0.570509 -0.821291 4.105263
17 0.024451 -0.999701 4.736842
18 0.609977 -0.792419 5.368421
19 0.960170 -0.279415 6.000000

In [21]:
import statsmodels.formula.api as smf

In [23]:
model = smf.ols("VIQ ~ Gender + 1", data).fit()

In [24]:
model.summary()


Out[24]:
OLS Regression Results
Dep. Variable: VIQ R-squared: 0.015
Model: OLS Adj. R-squared: -0.010
Method: Least Squares F-statistic: 0.5969
Date: Thu, 28 Dec 2017 Prob (F-statistic): 0.445
Time: 12:07:31 Log-Likelihood: -182.42
No. Observations: 40 AIC: 368.8
Df Residuals: 38 BIC: 372.2
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 109.4500 5.308 20.619 0.000 98.704 120.196
Gender[T.Male] 5.8000 7.507 0.773 0.445 -9.397 20.997
Omnibus: 26.188 Durbin-Watson: 1.709
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3.703
Skew: 0.010 Prob(JB): 0.157
Kurtosis: 1.510 Cond. No. 2.62

In [35]:
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.random.normal.html
import numpy as np

In [36]:
np.random.normal?

In [39]:
mu, sigma = 0, 0.1 # mean and standard deviation
s = np.random.normal(mu, sigma, 1000)

In [40]:
np.mean(s)


Out[40]:
-0.0060416323814836475

In [41]:
np.std(s, ddof=1)


Out[41]:
0.097624752491135836

In [42]:
import matplotlib.pyplot as plt

In [43]:
count, bins, ignored = plt.hist(s, 30, normed=True)
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *
                np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
          linewidth=2, color='r')


Out[43]:
[<matplotlib.lines.Line2D at 0x7f98c41df438>]

In [44]:
from scipy.stats import norm

In [91]:
norm?
# norm.pdf?

In [84]:
loc = 25
scale = 4
norm.pdf(10, loc, scale)


Out[84]:
8.8148920591861365e-05

In [85]:
x = np.linspace(norm.ppf(0.01, loc, scale), norm.ppf(0.99, loc, scale), 100)

In [86]:
norm.ppf(0.99, loc, scale)


Out[86]:
34.305391496163367

In [87]:
norm.ppf(0.01, loc, scale)


Out[87]:
15.694608503836637

In [93]:
plt.plot(x, norm.pdf(x, loc, scale), 'r-', lw=5, alpha=0.6)
plt.ylabel('PDF')
plt.xlabel('Age')


Out[93]:
Text(0.5,0,'Age')

In [98]:
ages = np.linspace(17, 90, 100)

In [102]:
loc


Out[102]:
25

In [181]:
threshold = 0.01
age_threshold = 0.01

def is_young(age):
    return norm.pdf(age, loc=20, scale=3)

def good_age(age):
    return norm.pdf(age, loc=45, scale=5)

def is_old(age):
    return norm.pdf(age, loc=90, scale=8)

plt.plot(ages, is_young(ages), 'r', lw=2, alpha=0.5)
plt.plot(ages, good_age(ages), 'g', lw=2, alpha=0.5)
plt.plot(ages, is_old(ages), 'r', lw=2, alpha=0.5)
plt.plot(ages, np.full(100, threshold), 'gray', lw=2, alpha=0.5)

plt.ylabel('PDF')
plt.xlabel('Age')


Out[181]:
Text(0.5,0,'Age')

In [128]:
norm.pdf(90, loc=90, scale=6)


Out[128]:
0.066490380066905455

In [148]:
def predict_on_age(age):
    if is_young(age) > threshold or is_old(age) > threshold:
        return 0
    if good_age(age) > threshold :
        return 1
    return 2

In [160]:
predict_on_age(32)


Out[160]:
2

In [205]:
kms = np.linspace(0, 100, 100)
kms_threshold = 0.005

def no_practice(km):
    return norm.pdf(km, loc=1, scale=3)

def much_driving(km):
    return norm.pdf(km, loc=100, scale=20)

def sweet_spot(km):
    return norm.pdf(km, loc=20, scale=5)

plt.plot(kms, no_practice(kms), 'r', lw=2, alpha=0.5)
plt.plot(kms, much_driving(kms), 'r', lw=2, alpha=0.5)
plt.plot(kms, sweet_spot(kms), 'g', lw=2, alpha=0.5)
plt.plot(kms, np.full(100, kms_threshold), 'gray', lw=2, alpha=0.5)

plt.ylabel('PDF')
plt.xlabel('thousand km per year')


Out[205]:
Text(0.5,0,'thousand km per year')

In [176]:
kmhs = np.linspace(90, 250, 100)
kmhs_threshold = 0.002

def too_fast(kmh):
    return norm.pdf(kmh, loc=250, scale=30)

plt.plot(kmhs, too_fast(kmhs), 'r', lw=2, alpha=0.5)
plt.plot(kmhs, np.full(100, kmhs_threshold), 'gray', lw=2, alpha=0.5)

plt.ylabel('PDF')
plt.xlabel('km/h')


Out[176]:
Text(0.5,0,'km/h')

In [187]:
def predict(x):
    speed, age, km_per_year = x
    if (is_young(age) > age_threshold or is_old(age) > age_threshold
        or too_fast(speed) > kmhs_threshold
        or no_practice(km_per_year) > kms_threshold or much_driving(km_per_year) > kms_threshold):
        return 0
    if good_age(age) > age_threshold or sweet_spot(km_per_year)  > kms_threshold:
        return 1
    return 2

In [197]:
predict([190, 47, 10])


Out[197]:
1

In [198]:
age_factor = 1
kmhs_factor = 1
kms_factor = 1

def scoring(x):
    speed, age, km_per_year = x
    pos_score = good_age(age) * age_factor + sweet_spot(km_per_year) * kms_factor
    neg_score = (is_young(age) * age_factor + is_old(age) * age_factor
        + too_fast(speed) * kmhs_factor
        + no_practice(km_per_year) * kms_factor + much_driving(km_per_year) * kms_factor)
    return pos_score - neg_score

In [200]:
scoring([190, 47, 10])


Out[200]:
0.08213760372927606

In [203]:
scoring([250, 87, 100])


Out[203]:
-0.079727076767173791

In [204]:
# as good as it gets
scoring([100, 47, 20])


Out[204]:
0.15343571647820856

In [206]:
# nightmare customer
scoring([250, 90, 1])


Out[206]:
-0.19608832714225796

In [207]:
# nightmare customer
scoring([180, 35, 25])


Out[207]:
0.05830014987666652

In [209]:
score_threshold = 0.005

def predict_for_score(x):
    score = scoring(x)
    if abs(score) < score_threshold:
        return 2
    if score < 0:
        return 1
    return 0

In [210]:
predict_for_score([190, 47, 10])


Out[210]:
0

In [211]:
predict_for_score([250, 87, 100])


Out[211]:
1

In [ ]:
scoring([100, 47, 20])