In [1]:
%pylab
%matplotlib inline
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics as smg
import pandas as pd
from pandas.plotting import parallel_coordinates, scatter_matrix
from scipy import stats


Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib

Analyse de la variance (ANOVA)


In [2]:
# One-way ANOVA: tests whether at least one group mean differs.
# First a toy example with two obviously different groups...
toy_a, toy_b = [1, 2, 3], [5, 6, 7]
F, p = stats.f_oneway(toy_a, toy_b)
print(F, p)

# ...then three samples whose means are closer together.
sample1 = [25.6636147577, 26.8147042254, 26.5087485812, 26.0110693572,
           26.1982930499, 25.0162178218, 25.4738536463, 25.7626961169,
           26.2413388405, 26.6684925808]
sample2 = [26.9368238908, 26.7905458624, 26.0659696128, 25.8725323008,
           26.67954654, 26.9751683032, 26.0701459549, 26.1627538932,
           26.9750950622, 25.6773437008]
sample3 = [27.0405788466, 26.7461246306, 28.1998587517, 26.180994282,
           27.4699458762, 26.2699015397, 26.1013955748, 27.9366444862,
           26.1518355511, 26.4466845405]
F, p = stats.f_oneway(sample1, sample2, sample3)
print(F, p)


24.0 0.008049893100837719
4.29126983118452 0.024083083839157738

Corrélation


In [3]:
# Cars dataset: http://lib.stat.cmu.edu/DASL/Datafiles/Cars.html
cars_file = 'donnees/cars.txt'
data = pd.read_csv(cars_file)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 8 columns):
Country         38 non-null object
Car             38 non-null object
MPG             38 non-null float64
Weight          38 non-null float64
Drive_Ratio     38 non-null float64
Horsepower      38 non-null int64
Displacement    38 non-null int64
Cylinders       38 non-null int64
dtypes: float64(3), int64(3), object(2)
memory usage: 2.5+ KB

In [4]:
# Pearson linear correlation coefficient between all numeric columns.
# Non-numeric columns (Country, Car) are excluded explicitly: pandas >= 2.0
# raises on object columns in DataFrame.corr() instead of silently
# dropping them as older versions did.
pearsonCorr = data.select_dtypes(include='number').corr()
names = pearsonCorr.index.tolist()
pearsonCorr


Out[4]:
MPG Weight Drive_Ratio Horsepower Displacement Cylinders
MPG 1.000000 -0.903071 0.417225 -0.871282 -0.786048 -0.805511
Weight -0.903071 1.000000 -0.687880 0.917220 0.950765 0.916678
Drive_Ratio 0.417225 -0.687880 1.000000 -0.588906 -0.798273 -0.692150
Horsepower -0.871282 0.917220 -0.588906 1.000000 0.871799 0.863847
Displacement -0.786048 0.950765 -0.798273 0.871799 1.000000 0.940281
Cylinders -0.805511 0.916678 -0.692150 0.863847 0.940281 1.000000

In [5]:
# Heat map of the Pearson correlation matrix.
# plt.matshow() opens its own figure by default, so the original's leading
# plt.figure() only produced a stray empty figure (see the
# "<Figure ... with 0 Axes>" in the output) — it is removed here.
plt.matshow(pearsonCorr)
plt.clim(-1., 1.)  # correlations live in [-1, 1]; pin the colour scale
plt.colorbar()
plt.xticks(range(len(names)), names, rotation=90)
plt.yticks(range(len(names)), names)


Out[5]:
([<matplotlib.axis.YTick at 0x7fce33d8ea58>,
  <matplotlib.axis.YTick at 0x7fce33d8e390>,
  <matplotlib.axis.YTick at 0x7fce33d7a390>,
  <matplotlib.axis.YTick at 0x7fce33d13e10>,
  <matplotlib.axis.YTick at 0x7fce33d1b390>,
  <matplotlib.axis.YTick at 0x7fce33d1b908>],
 <a list of 6 Text yticklabel objects>)
<Figure size 432x288 with 0 Axes>

In [6]:
# Spearman (rank) correlation — captures monotonic, not just linear,
# relationships.  Numeric columns are selected explicitly so the call
# keeps working on pandas >= 2.0, which raises on object columns.
spearmanCorr = data.select_dtypes(include='number').corr('spearman')
names = spearmanCorr.index.tolist()
spearmanCorr


Out[6]:
MPG Weight Drive_Ratio Horsepower Displacement Cylinders
MPG 1.000000 -0.937784 0.465140 -0.886450 -0.838471 -0.828359
Weight -0.937784 1.000000 -0.634491 0.907140 0.944804 0.896497
Drive_Ratio 0.465140 -0.634491 1.000000 -0.602754 -0.797872 -0.646733
Horsepower -0.886450 0.907140 -0.602754 1.000000 0.895518 0.847644
Displacement -0.838471 0.944804 -0.797872 0.895518 1.000000 0.904697
Cylinders -0.828359 0.896497 -0.646733 0.847644 0.904697 1.000000

In [7]:
# Heat map of the Spearman correlation matrix.
# matshow() creates its own figure; the original's plt.figure() left a
# stray empty figure behind and is removed.
plt.matshow(spearmanCorr)
plt.clim(-1., 1.)  # same fixed scale as the Pearson plot, for comparison
plt.colorbar()
plt.xticks(range(len(names)), names, rotation=90)
plt.yticks(range(len(names)), names)


Out[7]:
([<matplotlib.axis.YTick at 0x7fce33c995f8>,
  <matplotlib.axis.YTick at 0x7fce33c83ef0>,
  <matplotlib.axis.YTick at 0x7fce33ce20f0>,
  <matplotlib.axis.YTick at 0x7fce337afb38>,
  <matplotlib.axis.YTick at 0x7fce337b60f0>,
  <matplotlib.axis.YTick at 0x7fce337b6630>],
 <a list of 6 Text yticklabel objects>)
<Figure size 432x288 with 0 Axes>

In [8]:
# Element-wise gap between the two correlation measures; small values
# everywhere suggest the relationships are essentially linear.
pearsonCorr - spearmanCorr


Out[8]:
MPG Weight Drive_Ratio Horsepower Displacement Cylinders
MPG 0.000000 0.034713 -0.047915 0.015168 0.052423 0.022848
Weight 0.034713 0.000000 -0.053389 0.010081 0.005961 0.020180
Drive_Ratio -0.047915 -0.053389 0.000000 0.013847 -0.000401 -0.045417
Horsepower 0.015168 0.010081 0.013847 0.000000 -0.023719 0.016203
Displacement 0.052423 0.005961 -0.000401 -0.023719 0.000000 0.035584
Cylinders 0.022848 0.020180 -0.045417 0.016203 0.035584 0.000000

In [9]:
# Heat map of the Pearson-minus-Spearman gap (auto colour scale, since the
# differences are small).  matshow() creates its own figure; the original's
# plt.figure() only produced a stray empty figure and is removed.
plt.matshow(pearsonCorr - spearmanCorr)
plt.colorbar()
plt.xticks(range(len(names)), names, rotation=90)
plt.yticks(range(len(names)), names)


Out[9]:
([<matplotlib.axis.YTick at 0x7fce337816a0>,
  <matplotlib.axis.YTick at 0x7fce33773f60>,
  <matplotlib.axis.YTick at 0x7fce66241898>,
  <matplotlib.axis.YTick at 0x7fce336e6ba8>,
  <matplotlib.axis.YTick at 0x7fce336ec160>,
  <matplotlib.axis.YTick at 0x7fce336e6a20>],
 <a list of 6 Text yticklabel objects>)
<Figure size 432x288 with 0 Axes>

In [10]:
# Pairwise scatter plots of the numeric variables (KDE on the diagonal).
# scatter_matrix() draws into its own new figure, so the original's leading
# plt.figure() left a stray empty figure and is removed.
scatter_matrix(data[names], diagonal='kde')
# Zoom on the strongest positive pair seen in the matrix.
plt.figure()
plt.scatter(data.Weight, data.Horsepower)
plt.xlabel('Weight')
plt.ylabel('Horsepower')


Out[10]:
Text(0, 0.5, 'Horsepower')
<Figure size 432x288 with 0 Axes>

Régression linéaire


In [11]:
# Simple linear regression: MPG explained by Horsepower.
model = smf.ols('MPG ~ Horsepower', data=data)
results = model.fit()
print(results.summary())

# Fitted vs. observed values.  plot_fit() opens its own figure, so the
# original's preliminary plt.figure() calls only produced stray empty
# figures (visible in the output) and are removed, along with the
# commented-out plot_regress_exog experiment.
smg.regressionplots.plot_fit(results, 1)

# Residual diagnostics: (1) are residuals roughly normal?
plt.figure()
plt.hist(results.resid)
plt.title('test normalité: {}'.format(stats.normaltest(results.resid)))
# (2) residuals vs. fitted values — look for structure/heteroscedasticity.
plt.figure()
plt.scatter(results.fittedvalues, results.resid)
plt.xlabel('Valeurs prédites')
plt.ylabel('Résidus')
plt.grid()


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    MPG   R-squared:                       0.759
Model:                            OLS   Adj. R-squared:                  0.752
Method:                 Least Squares   F-statistic:                     113.5
Date:                Mon, 12 Aug 2019   Prob (F-statistic):           1.12e-12
Time:                        15:35:33   Log-Likelihood:                -97.770
No. Observations:                  38   AIC:                             199.5
Df Residuals:                      36   BIC:                             202.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     46.7066      2.127     21.959      0.000      42.393      51.020
Horsepower    -0.2157      0.020    -10.652      0.000      -0.257      -0.175
==============================================================================
Omnibus:                        0.998   Durbin-Watson:                   1.118
Prob(Omnibus):                  0.607   Jarque-Bera (JB):                0.678
Skew:                           0.326   Prob(JB):                        0.713
Kurtosis:                       2.943   Cond. No.                         423.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>

In [12]:
# OLS with a categorical regressor: patsy expands Country into dummy
# variables (France is absorbed into the intercept as the reference level).
model = smf.ols('MPG ~ Country', data=data)
results = model.fit()
print(results.summary())

# Same hypothesis tested with scipy's one-way ANOVA: one MPG sample per
# country.  The F statistic and p-value match the regression's.
mpgByCountry = [group['MPG'].tolist()
                for _, group in data.groupby(['Country'])]
F, p = stats.f_oneway(*mpgByCountry)
print(data[['Country','MPG']].groupby(['Country']).describe())
print('Résultats de l\'ANOVA',F, p)


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    MPG   R-squared:                       0.347
Model:                            OLS   Adj. R-squared:                  0.245
Method:                 Least Squares   F-statistic:                     3.406
Date:                Mon, 12 Aug 2019   Prob (F-statistic):             0.0140
Time:                        15:35:33   Log-Likelihood:                -116.71
No. Observations:                  38   AIC:                             245.4
Df Residuals:                      32   BIC:                             255.2
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
======================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             16.2000      5.688      2.848      0.008       4.615      27.785
Country[T.Germany]    10.9400      6.230      1.756      0.089      -1.751      23.631
Country[T.Italy]      21.1000      8.043      2.623      0.013       4.716      37.484
Country[T.Japan]      13.4000      6.080      2.204      0.035       1.015      25.785
Country[T.Sweden]      3.1000      6.966      0.445      0.659     -11.089      17.289
Country[T.U.S.]        6.7955      5.815      1.169      0.251      -5.050      18.641
==============================================================================
Omnibus:                        4.233   Durbin-Watson:                   1.131
Prob(Omnibus):                  0.120   Jarque-Bera (JB):                2.189
Skew:                           0.314   Prob(JB):                        0.335
Kurtosis:                       2.006   Cond. No.                         18.6
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
          MPG                                                      
        count       mean       std   min     25%   50%    75%   max
Country                                                            
France    1.0  16.200000       NaN  16.2  16.200  16.2  16.20  16.2
Germany   5.0  27.140000  5.734806  20.3  21.500  30.5  31.50  31.9
Italy     1.0  37.300000       NaN  37.3  37.300  37.3  37.30  37.3
Japan     7.0  29.600000  4.532843  22.0  27.350  29.5  32.95  35.1
Sweden    2.0  19.300000  3.252691  17.0  18.150  19.3  20.45  21.6
U.S.     22.0  22.995455  6.054237  15.5  18.125  20.7  28.15  34.2
Résultats de l'ANOVA 3.406370053647239 0.014029549506410841

In [13]:
# Example: build the equivalent binary indicator columns by hand
# (one boolean column per country).
for country in data.Country.unique():
    data[country] = (data['Country'] == country)
# 'U.S.' contains dots, which are not valid in a formula term: alias it.
data['US'] = data['U.S.']

# Same regression as above, now on the explicit dummies — identical fit.
model = smf.ols('MPG ~ Germany + Italy + Japan + Sweden + US', data=data)
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    MPG   R-squared:                       0.347
Model:                            OLS   Adj. R-squared:                  0.245
Method:                 Least Squares   F-statistic:                     3.406
Date:                Mon, 12 Aug 2019   Prob (F-statistic):             0.0140
Time:                        15:35:33   Log-Likelihood:                -116.71
No. Observations:                  38   AIC:                             245.4
Df Residuals:                      32   BIC:                             255.2
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          16.2000      5.688      2.848      0.008       4.615      27.785
Germany[T.True]    10.9400      6.230      1.756      0.089      -1.751      23.631
Italy[T.True]      21.1000      8.043      2.623      0.013       4.716      37.484
Japan[T.True]      13.4000      6.080      2.204      0.035       1.015      25.785
Sweden[T.True]      3.1000      6.966      0.445      0.659     -11.089      17.289
US[T.True]          6.7955      5.815      1.169      0.251      -5.050      18.641
==============================================================================
Omnibus:                        4.233   Durbin-Watson:                   1.131
Prob(Omnibus):                  0.120   Jarque-Bera (JB):                2.189
Skew:                           0.314   Prob(JB):                        0.335
Kurtosis:                       2.006   Cond. No.                         18.6
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [14]:
# Multiple regression: MPG on all numeric predictors at once.
# The large condition number in the summary flags multicollinearity
# (Weight, Displacement and Cylinders are highly correlated).
model = smf.ols('MPG ~ Weight + Drive_Ratio + Horsepower + Displacement + Cylinders',
                data=data)
results = model.fit()
print(results.summary())

# plot_fit() opens its own figure; the original's preliminary plt.figure()
# calls left stray empty figures (visible in the output) and are removed,
# along with the commented-out plot_regress_exog experiment.
smg.regressionplots.plot_fit(results, 1)

# Residual diagnostics: normality of residuals...
plt.figure()
plt.hist(results.resid)
plt.title('test normalité: {}'.format(stats.normaltest(results.resid)))
# ...and residuals vs. fitted values.
plt.figure()
plt.scatter(results.fittedvalues, results.resid)
plt.xlabel('Valeurs prédites')
plt.ylabel('Résidus')
plt.grid()


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    MPG   R-squared:                       0.907
Model:                            OLS   Adj. R-squared:                  0.893
Method:                 Least Squares   F-statistic:                     62.48
Date:                Mon, 12 Aug 2019   Prob (F-statistic):           1.45e-15
Time:                        15:35:33   Log-Likelihood:                -79.673
No. Observations:                  38   AIC:                             171.3
Df Residuals:                      32   BIC:                             181.2
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       69.2205      4.626     14.963      0.000      59.797      78.644
Weight         -11.3769      2.033     -5.596      0.000     -15.518      -7.236
Drive_Ratio     -3.3454      1.271     -2.631      0.013      -5.935      -0.756
Horsepower      -0.0448      0.034     -1.302      0.202      -0.115       0.025
Displacement     0.0332      0.020      1.650      0.109      -0.008       0.074
Cylinders       -0.5318      0.686     -0.775      0.444      -1.930       0.866
==============================================================================
Omnibus:                        9.741   Durbin-Watson:                   1.498
Prob(Omnibus):                  0.008   Jarque-Bera (JB):                8.932
Skew:                           0.958   Prob(JB):                       0.0115
Kurtosis:                       4.402   Cond. No.                     3.04e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.04e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>

In [15]:
# Reduced model: keep only the two predictors that stayed significant
# in the full regression above.
model = smf.ols('MPG ~ Weight + Drive_Ratio', data=data)
results = model.fit()
print(results.summary())

# Residuals vs. fitted values for this reduced model.
plt.figure()
plt.scatter(results.fittedvalues, results.resid)
plt.xlabel('Valeurs prédites')
plt.ylabel('Résidus')
plt.grid()


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    MPG   R-squared:                       0.895
Model:                            OLS   Adj. R-squared:                  0.888
Method:                 Least Squares   F-statistic:                     148.4
Date:                Mon, 12 Aug 2019   Prob (F-statistic):           8.05e-18
Time:                        15:35:34   Log-Likelihood:                -82.082
No. Observations:                  38   AIC:                             170.2
Df Residuals:                      35   BIC:                             175.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      70.9191      4.590     15.450      0.000      61.600      80.238
Weight        -10.8315      0.701    -15.461      0.000     -12.254      -9.409
Drive_Ratio    -4.8972      0.957     -5.119      0.000      -6.839      -2.955
==============================================================================
Omnibus:                       10.133   Durbin-Watson:                   1.450
Prob(Omnibus):                  0.006   Jarque-Bera (JB):                9.293
Skew:                           1.020   Prob(JB):                      0.00960
Kurtosis:                       4.306   Cond. No.                         57.8
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Modèles de choix discret


In [16]:
# Cars dataset: build a binary target — is the car's MPG above the median?
data['MPG0'] = data['MPG'] > data['MPG'].median()
data[['Country', 'Car', 'MPG0']].describe()


Out[16]:
Country Car MPG0
count 38 38 38
unique 6 38 2
top U.S. Pontiac Phoenix True
freq 22 1 19

In [17]:
# Binary logit: probability that MPG is above the median, from the raw
# numeric predictors.
# NOTE(review): no constant column is added (sm.add_constant), so the model
# is fit without an intercept — confirm this is intentional.
model = sm.Logit(data['MPG0'], data[['Weight', 'Drive_Ratio', 'Horsepower', 'Displacement', 'Cylinders']])
results = model.fit()
print(results.summary())


Optimization terminated successfully.
         Current function value: 0.396328
         Iterations 7
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   MPG0   No. Observations:                   38
Model:                          Logit   Df Residuals:                       33
Method:                           MLE   Df Model:                            4
Date:                Mon, 12 Aug 2019   Pseudo R-squ.:                  0.4282
Time:                        15:35:34   Log-Likelihood:                -15.060
converged:                       True   LL-Null:                       -26.340
Covariance Type:            nonrobust   LLR p-value:                 0.0001551
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Weight          -3.7841      2.946     -1.284      0.199      -9.559       1.990
Drive_Ratio      2.9777      1.331      2.238      0.025       0.370       5.586
Horsepower      -0.0183      0.040     -0.461      0.645      -0.096       0.059
Displacement     0.0313      0.031      1.001      0.317      -0.030       0.093
Cylinders       -0.4158      0.831     -0.501      0.617      -2.044       1.213
================================================================================

In [18]:
# Multinomial logit: predict the country of origin from the numeric
# characteristics (France is the reference category in the summary).
# NOTE(review): as above, no intercept term is added — confirm intentional.
model = sm.MNLogit(data['Country'], data[['Weight', 'Horsepower', 'Displacement', 'Cylinders']])
results = model.fit()
print(results.summary())


Optimization terminated successfully.
         Current function value: 0.546565
         Iterations 13
                          MNLogit Regression Results                          
==============================================================================
Dep. Variable:                Country   No. Observations:                   38
Model:                        MNLogit   Df Residuals:                       18
Method:                           MLE   Df Model:                           15
Date:                Mon, 12 Aug 2019   Pseudo R-squ.:                  0.5597
Time:                        15:35:34   Log-Likelihood:                -20.769
converged:                       True   LL-Null:                       -47.170
Covariance Type:            nonrobust   LLR p-value:                 4.159e-06
================================================================================
Country=Germany       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Weight              2.1390      8.854      0.242      0.809     -15.215      19.493
Horsepower          0.1004      0.212      0.474      0.635      -0.315       0.515
Displacement       -0.2161      0.209     -1.032      0.302      -0.626       0.194
Cylinders           2.5824      3.164      0.816      0.414      -3.619       8.784
-----------------------------------------------------------------------------------
Country=Italy       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Weight           61.7992     86.821      0.712      0.477    -108.367     231.965
Horsepower       -0.3700      0.419     -0.883      0.377      -1.191       0.451
Displacement     -0.9472      1.460     -0.649      0.517      -3.810       1.915
Cylinders        -4.2652      8.861     -0.481      0.630     -21.632      13.102
---------------------------------------------------------------------------------
Country=Japan       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Weight           -1.8525      9.261     -0.200      0.841     -20.005      16.300
Horsepower       -0.2517      0.214     -1.174      0.240      -0.672       0.169
Displacement      0.1570      0.201      0.783      0.434      -0.236       0.550
Cylinders         2.3481      3.121      0.752      0.452      -3.769       8.466
---------------------------------------------------------------------------------
Country=Sweden       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Weight            -1.8174      9.931     -0.183      0.855     -21.281      17.646
Horsepower         0.0722      0.216      0.335      0.737      -0.350       0.495
Displacement      -0.0024      0.190     -0.012      0.990      -0.375       0.370
Cylinders         -0.4283      3.404     -0.126      0.900      -7.099       6.243
----------------------------------------------------------------------------------
Country=U.S.       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Weight          -0.0638      9.800     -0.007      0.995     -19.271      19.143
Horsepower      -0.6874      0.389     -1.768      0.077      -1.449       0.075
Displacement     0.4818      0.292      1.650      0.099      -0.090       1.054
Cylinders        0.8002      3.455      0.232      0.817      -5.971       7.572
================================================================================

In [21]:
# Autos dataset (tab-separated file).
autos = pd.read_csv('donnees/autos.txt', delimiter='\t')
autos.info()
# The original also called autos.describe() mid-cell: its result was
# silently discarded (only a cell's last expression is displayed), so the
# no-op call is removed.  The class labels are strings prefixed with '_'
# ('_3' ... '_-2'):
autos.symboling.unique()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
normalized-losses    205 non-null float64
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null float64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null float64
fuel-system          205 non-null object
bore                 205 non-null float64
stroke               205 non-null float64
compression-ratio    205 non-null float64
horsepower           205 non-null float64
peak-rpm             205 non-null float64
city-mpg             205 non-null float64
highway-mpg          205 non-null float64
price                205 non-null float64
symboling            205 non-null object
dtypes: float64(15), object(11)
memory usage: 41.7+ KB
Out[21]:
array(['_3', '_1', '_2', '_0', '_-1', '_-2'], dtype=object)

In [22]:
# Multinomial logit: predict the insurance risk rating (symboling) from
# price and body dimensions.
# NOTE(review): no intercept term is added here either — confirm intentional.
model = sm.MNLogit(autos['symboling'], autos[['price', 'length', 'width', 'height', 'engine-size']])
results = model.fit()
print(results.summary())


Optimization terminated successfully.
         Current function value: 1.267403
         Iterations 9
                          MNLogit Regression Results                          
==============================================================================
Dep. Variable:              symboling   No. Observations:                  205
Model:                        MNLogit   Df Residuals:                      180
Method:                           MLE   Df Model:                           20
Date:                Mon, 12 Aug 2019   Pseudo R-squ.:                  0.1954
Time:                        15:35:59   Log-Likelihood:                -259.82
converged:                       True   LL-Null:                       -322.91
Covariance Type:            nonrobust   LLR p-value:                 2.022e-17
================================================================================
symboling=_-2       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
price         -5.754e-05      0.000     -0.331      0.741      -0.000       0.000
length            0.1874      0.126      1.492      0.136      -0.059       0.434
width            -0.4363      0.398     -1.095      0.273      -1.217       0.344
height           -0.0955      0.463     -0.206      0.836      -1.002       0.811
engine-size      -0.0092      0.033     -0.282      0.778      -0.073       0.055
---------------------------------------------------------------------------------
symboling=_0       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
price        -5.311e-05   6.03e-05     -0.881      0.379      -0.000    6.51e-05
length          -0.0562      0.048     -1.181      0.238      -0.150       0.037
width            0.1517      0.142      1.072      0.284      -0.126       0.429
height           0.0138      0.138      0.100      0.920      -0.258       0.285
engine-size      0.0088      0.013      0.695      0.487      -0.016       0.034
--------------------------------------------------------------------------------
symboling=_1       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
price        -4.298e-05   7.61e-05     -0.565      0.572      -0.000       0.000
length          -0.1926      0.055     -3.471      0.001      -0.301      -0.084
width            0.8120      0.170      4.769      0.000       0.478       1.146
height          -0.3102      0.160     -1.943      0.052      -0.623       0.003
engine-size     -0.0136      0.017     -0.811      0.418      -0.046       0.019
--------------------------------------------------------------------------------
symboling=_2       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
price        -9.789e-05   8.41e-05     -1.164      0.245      -0.000     6.7e-05
length          -0.1620      0.058     -2.797      0.005      -0.275      -0.048
width            0.5453      0.171      3.190      0.001       0.210       0.880
height          -0.1189      0.162     -0.736      0.462      -0.436       0.198
engine-size      0.0049      0.018      0.272      0.786      -0.030       0.040
--------------------------------------------------------------------------------
symboling=_3       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
price            0.0001   7.32e-05      1.476      0.140   -3.54e-05       0.000
length          -0.1233      0.063     -1.968      0.049      -0.246      -0.000
width            1.0313      0.197      5.244      0.000       0.646       1.417
height          -0.8195      0.196     -4.190      0.000      -1.203      -0.436
engine-size     -0.0326      0.017     -1.954      0.051      -0.065    9.34e-05
================================================================================

In [23]:
# Travel Mode Choice http://www.statsmodels.org/stable/datasets/generated/modechoice.html
# mode =
#        1 - air
#        2 - train
#        3 - bus
#        4 - car
modechoices = sm.datasets.modechoice.load_pandas()
# Keep only the rows describing the alternative that was actually chosen
# (one row per individual).  The commented-out exploration lines of the
# original (info/describe/parallel_coordinates/scatter_matrix) were dead
# code and are removed.
# NOTE(review): this rebinds `data`, which previously held the cars
# DataFrame — downstream cells now see the mode-choice data.
data = modechoices.data[modechoices.data['choice'] == 1.].copy()

In [24]:
# Correlations among the chosen-alternative rows.  The 'choice' column is
# constant here (every kept row has choice == 1), so its correlation row
# and column are NaN in the printout below.
print(data.corr())
# Binary target: did the individual choose the car (mode 4)?
data['car'] = (data['mode'] == 4)
# Logit on in-vehicle cost, household income and party size.
# NOTE(review): fit without an intercept (no sm.add_constant) — confirm
# intentional; the output also warns about possible quasi-separation.
model = sm.Logit(data['car'], data[['invc', 'hinc', 'psize']])
results = model.fit()
print(results.summary())


            individual      mode  choice      ttme      invc      invt  \
individual    1.000000  0.104975     NaN -0.095073 -0.010584  0.005586   
mode          0.104975  1.000000     NaN -0.693467 -0.751150  0.470729   
choice             NaN       NaN     NaN       NaN       NaN       NaN   
ttme         -0.095073 -0.693467     NaN  1.000000  0.630972 -0.224208   
invc         -0.010584 -0.751150     NaN  0.630972  1.000000 -0.378400   
invt          0.005586  0.470729     NaN -0.224208 -0.378400  1.000000   
gc           -0.005248 -0.195573     NaN  0.318304  0.502565  0.576185   
hinc          0.051693  0.071394     NaN -0.061134  0.114834 -0.125492   
psize         0.046910  0.210923     NaN -0.142706 -0.210883  0.146518   

                  gc      hinc     psize  
individual -0.005248  0.051693  0.046910  
mode       -0.195573  0.071394  0.210923  
choice           NaN       NaN       NaN  
ttme        0.318304 -0.061134 -0.142706  
invc        0.502565  0.114834 -0.210883  
invt        0.576185 -0.125492  0.146518  
gc          1.000000  0.034101  0.115334  
hinc        0.034101  1.000000  0.180963  
psize       0.115334  0.180963  1.000000  
Optimization terminated successfully.
         Current function value: 0.274111
         Iterations 8
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                    car   No. Observations:                  210
Model:                          Logit   Df Residuals:                      207
Method:                           MLE   Df Model:                            2
Date:                Mon, 12 Aug 2019   Pseudo R-squ.:                  0.5384
Time:                        15:35:59   Log-Likelihood:                -57.563
converged:                       True   LL-Null:                       -124.71
Covariance Type:            nonrobust   LLR p-value:                 6.905e-30
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
invc          -0.1237      0.019     -6.517      0.000      -0.161      -0.087
hinc           0.0554      0.013      4.181      0.000       0.029       0.081
psize          0.3623      0.220      1.643      0.100      -0.070       0.794
==============================================================================

Possibly complete quasi-separation: A fraction 0.13 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.

In [25]:
# Multinomial logit on the full long-format dataset: endog is the 0/1
# 'choice' indicator, exog the alternative attributes supplied by the
# statsmodels dataset loader.
model = sm.MNLogit(modechoices.endog, modechoices.exog)
results = model.fit()
print(results.summary())


Optimization terminated successfully.
         Current function value: 0.499751
         Iterations 6
                          MNLogit Regression Results                          
==============================================================================
Dep. Variable:                 choice   No. Observations:                  840
Model:                        MNLogit   Df Residuals:                      834
Method:                           MLE   Df Model:                            5
Date:                Mon, 12 Aug 2019   Pseudo R-squ.:                  0.1113
Time:                        15:35:59   Log-Likelihood:                -419.79
converged:                       True   LL-Null:                       -472.36
Covariance Type:            nonrobust   LLR p-value:                 4.350e-21
==============================================================================
  choice=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ttme          -0.0434      0.005     -8.972      0.000      -0.053      -0.034
invc           0.0028      0.009      0.321      0.748      -0.014       0.020
invt          -0.0038      0.001     -3.476      0.001      -0.006      -0.002
gc             0.0163      0.009      1.910      0.056      -0.000       0.033
hinc          -0.0014      0.004     -0.332      0.740      -0.010       0.007
psize          0.1029      0.092      1.114      0.265      -0.078       0.284
==============================================================================

In [26]:
# autres ? modèle logit binaire (probabilité de marcher) et multinomial (choix de mode / gravité des accidents)