In [2]:
import pandas as pd
In [3]:
import statsmodels.formula.api as sm
In [4]:
# Load the iris dataset from the Rdatasets CSV mirror into a DataFrame.
iris = pd.read_csv(
    "http://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv"
)
In [6]:
# Drop the 'Unnamed: 0' column that came in with the CSV.
# The column is named via the `columns=` keyword: passing the axis as a bare
# positional argument (`drop(label, 1)`) is deprecated in pandas and no longer
# accepted by pandas 2.x.
iris = iris.drop(columns='Unnamed: 0')
In [7]:
# Preview the first rows of the frame to check the loaded columns.
iris.head()
Out[7]:
Sepal.Length
Sepal.Width
Petal.Length
Petal.Width
Species
0
5.1
3.5
1.4
0.2
setosa
1
4.9
3.0
1.4
0.2
setosa
2
4.7
3.2
1.3
0.2
setosa
3
4.6
3.1
1.5
0.2
setosa
4
5.0
3.6
1.4
0.2
setosa
In [15]:
# Replace the dotted column names (e.g. 'Sepal.Length') with underscore
# versions; presumably so they can be used as plain identifiers in the
# regression formula.
iris.columns = [
    'Sepal_Length',
    'Sepal_Width',
    'Petal_Length',
    'Petal_Width',
    'Species',
]
In [16]:
# Display the column index to confirm the new names.
iris.columns
Out[16]:
Index(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width',
'Species'],
dtype='object')
In [17]:
# Specify an OLS regression of Sepal_Length on the other three measurements
# plus Species. Nothing is estimated here: `.fit()` must be called on
# `result` to obtain the regression results.
result = sm.ols(
    formula="Sepal_Length ~ Petal_Length + Sepal_Width + Petal_Width + Species",
    data=iris,
)
In [18]:
# Fit the model; the returned results object is only displayed, not stored.
# NOTE(review): later cells call result.fit() again each time - consider
# fitting once and reusing the results object.
result.fit()
Out[18]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x9bafe10>
In [19]:
# Fit the model and show the full regression summary table.
result.fit().summary()
Out[19]:
OLS Regression Results
Dep. Variable: Sepal_Length R-squared: 0.867
Model: OLS Adj. R-squared: 0.863
Method: Least Squares F-statistic: 188.3
Date: Mon, 13 Mar 2017 Prob (F-statistic): 2.67e-61
Time: 17:56:48 Log-Likelihood: -32.558
No. Observations: 150 AIC: 77.12
Df Residuals: 144 BIC: 95.18
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 2.1713 0.280 7.760 0.000 1.618 2.724
Species[T.versicolor] -0.7236 0.240 -3.013 0.003 -1.198 -0.249
Species[T.virginica] -1.0235 0.334 -3.067 0.003 -1.683 -0.364
Petal_Length 0.8292 0.069 12.101 0.000 0.694 0.965
Sepal_Width 0.4959 0.086 5.761 0.000 0.326 0.666
Petal_Width -0.3152 0.151 -2.084 0.039 -0.614 -0.016
Omnibus: 0.418 Durbin-Watson: 1.966
Prob(Omnibus): 0.811 Jarque-Bera (JB): 0.572
Skew: -0.060 Prob(JB): 0.751
Kurtosis: 2.722 Cond. No. 94.0
In [20]:
# Fit the model and show only the estimated coefficients.
result.fit().params
Out[20]:
Intercept 2.171266
Species[T.versicolor] -0.723562
Species[T.virginica] -1.023498
Petal_Length 0.829244
Sepal_Width 0.495889
Petal_Width -0.315155
dtype: float64
In [23]:
# Fit the model and run the outlier test with Bonferroni-adjusted p-values
# at alpha = 0.05. The result has one row per observation with the columns
# student_resid, unadj_p and bonf(p).
# NOTE(review): this displays the whole 150-row frame; a filtered or
# truncated view would be easier to read.
result.fit().outlier_test(method='bonf', alpha=0.05)
Out[23]:
student_resid
unadj_p
bonf(p)
0
0.312689
0.754973
1.0
1
0.473016
0.636923
1.0
2
-0.240279
0.810458
1.0
3
-0.956277
0.340546
1.0
4
-0.178770
0.858371
1.0
5
0.036712
0.970765
1.0
6
-1.066895
0.287817
1.0
7
-0.125127
0.900599
1.0
8
-1.021792
0.308605
1.0
9
-0.068824
0.945226
1.0
10
0.703086
0.483145
1.0
11
-1.058507
0.291609
1.0
12
0.038433
0.969396
1.0
13
-0.792571
0.429341
1.0
14
2.431167
0.016288
1.0
15
0.778864
0.437347
1.0
16
1.142357
0.255215
1.0
17
0.416300
0.677815
1.0
18
1.089623
0.277712
1.0
19
-0.346136
0.729749
1.0
20
0.645498
0.519639
1.0
21
-0.078552
0.937498
1.0
22
-0.405720
0.685554
1.0
23
0.133053
0.894339
1.0
24
-1.904722
0.058824
1.0
25
0.255775
0.798492
1.0
26
-0.190906
0.848870
1.0
27
0.368833
0.712798
1.0
28
0.805113
0.422091
1.0
29
-1.063322
0.289428
1.0
...
...
...
...
120
0.540764
0.589512
1.0
121
-1.229878
0.220762
1.0
122
0.814912
0.416478
1.0
123
1.054229
0.293556
1.0
124
-0.491062
0.624135
1.0
125
0.191377
0.848501
1.0
126
0.833707
0.405837
1.0
127
-0.104393
0.917003
1.0
128
-0.388983
0.697868
1.0
129
0.878534
0.381128
1.0
130
1.353238
0.178115
1.0
131
0.653646
0.514390
1.0
132
-0.286276
0.775081
1.0
133
0.024680
0.980344
1.0
134
-1.876542
0.062619
1.0
135
2.474327
0.014518
1.0
136
-1.413807
0.159592
1.0
137
-0.924181
0.356949
1.0
138
-0.161519
0.871913
1.0
139
1.319446
0.189129
1.0
140
0.425193
0.671335
1.0
141
2.426542
0.016488
1.0
142
-1.048813
0.296034
1.0
143
-0.337894
0.735939
1.0
144
-0.077141
0.938619
1.0
145
1.606589
0.110351
1.0
146
1.215637
0.226126
1.0
147
0.602340
0.547902
1.0
148
-1.294716
0.197505
1.0
149
-1.321604
0.188411
1.0
150 rows × 3 columns
In [21]:
# List the attributes and methods available on the fitted results object.
# NOTE(review): exploratory introspection - candidate for removal from a
# finished notebook.
dir(result.fit())
Out[21]:
['HC0_se',
'HC1_se',
'HC2_se',
'HC3_se',
'_HCCM',
'__class__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__le__',
'__lt__',
'__module__',
'__ne__',
'__new__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__str__',
'__subclasshook__',
'__weakref__',
'_cache',
'_data_attr',
'_get_robustcov_results',
'_is_nested',
'_wexog_singular_values',
'aic',
'bic',
'bse',
'centered_tss',
'compare_f_test',
'compare_lm_test',
'compare_lr_test',
'condition_number',
'conf_int',
'conf_int_el',
'cov_HC0',
'cov_HC1',
'cov_HC2',
'cov_HC3',
'cov_kwds',
'cov_params',
'cov_type',
'df_model',
'df_resid',
'eigenvals',
'el_test',
'ess',
'f_pvalue',
'f_test',
'fittedvalues',
'fvalue',
'get_influence',
'get_robustcov_results',
'initialize',
'k_constant',
'llf',
'load',
'model',
'mse_model',
'mse_resid',
'mse_total',
'nobs',
'normalized_cov_params',
'outlier_test',
'params',
'predict',
'pvalues',
'remove_data',
'resid',
'resid_pearson',
'rsquared',
'rsquared_adj',
'save',
'scale',
'ssr',
'summary',
'summary2',
't_test',
'tvalues',
'uncentered_tss',
'use_t',
'wald_test',
'wresid']
In [26]:
# Run the outlier test on the fitted model and print only the observations
# whose Bonferroni-adjusted p-value (third column, 'bonf(p)') is below 0.05.
test = result.fit().outlier_test()
print('Bad data points (bonf(p) < 0.05):')
# Positional column access uses .iloc[:, 2]; DataFrame.icol() is deprecated
# and is not available in current pandas.
print(test[test.iloc[:, 2] < 0.05])
Bad data points (bonf(p) < 0.05):
Empty DataFrame
Columns: [student_resid, unadj_p, bonf(p)]
Index: []
C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
app.launch_new_instance()
In [ ]:
Content source: decisionstats/pythonfordatascience
Similar notebooks: