In [2]:
import pandas as pd

In [3]:
import statsmodels.formula.api as sm

In [4]:
# Read the iris measurements straight from the Rdatasets CSV mirror.
iris_csv_url = "http://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv"
iris = pd.read_csv(iris_csv_url)

In [6]:
# Remove the unnamed first column that came in with the CSV. The column is
# selected with the `columns=` keyword because passing the axis positionally
# (`drop(label, 1)`) is deprecated and rejected by pandas 2.x.
# NOTE(review): this cell is not idempotent -- re-running it after the column
# is gone raises KeyError; confirm the label still matches the current CSV.
iris = iris.drop(columns='Unnamed: 0')

In [7]:
# Preview the first rows to check the frame after the unnamed column was dropped.
iris.head()


Out[7]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

In [15]:
# Swap the dotted CSV headers for underscore names; these are the names the
# regression formula refers to later in the notebook.
iris.columns = [
    'Sepal_Length',
    'Sepal_Width',
    'Petal_Length',
    'Petal_Width',
    'Species',
]

In [16]:
# Display the column index to confirm the rename took effect.
iris.columns


Out[16]:
Index(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width',
       'Species'],
      dtype='object')

In [17]:
# Specify an OLS regression of sepal length on the three other measurements
# plus species. Despite its name, `result` is the *unfitted* model object;
# the cells below call .fit() on it to obtain estimates.
result = sm.ols(
    formula="Sepal_Length  ~  Petal_Length  + Sepal_Width + Petal_Width + Species",
    data=iris,
)

In [18]:
# Fit the model; the returned results wrapper is only displayed, not stored.
# NOTE(review): every later cell calls result.fit() again, re-estimating the
# same model -- consider keeping the fitted results in a variable and reusing it.
result.fit()


Out[18]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x9bafe10>

In [19]:
# Render the full regression report: fit statistics, the coefficient table
# with confidence intervals, and the residual diagnostics.
result.fit().summary()


Out[19]:
OLS Regression Results
Dep. Variable: Sepal_Length R-squared: 0.867
Model: OLS Adj. R-squared: 0.863
Method: Least Squares F-statistic: 188.3
Date: Mon, 13 Mar 2017 Prob (F-statistic): 2.67e-61
Time: 17:56:48 Log-Likelihood: -32.558
No. Observations: 150 AIC: 77.12
Df Residuals: 144 BIC: 95.18
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 2.1713 0.280 7.760 0.000 1.618 2.724
Species[T.versicolor] -0.7236 0.240 -3.013 0.003 -1.198 -0.249
Species[T.virginica] -1.0235 0.334 -3.067 0.003 -1.683 -0.364
Petal_Length 0.8292 0.069 12.101 0.000 0.694 0.965
Sepal_Width 0.4959 0.086 5.761 0.000 0.326 0.666
Petal_Width -0.3152 0.151 -2.084 0.039 -0.614 -0.016
Omnibus: 0.418 Durbin-Watson: 1.966
Prob(Omnibus): 0.811 Jarque-Bera (JB): 0.572
Skew: -0.060 Prob(JB): 0.751
Kurtosis: 2.722 Cond. No. 94.0

In [20]:
# Show just the estimated coefficients, labelled by model term.
result.fit().params


Out[20]:
Intercept                2.171266
Species[T.versicolor]   -0.723562
Species[T.virginica]    -1.023498
Petal_Length             0.829244
Sepal_Width              0.495889
Petal_Width             -0.315155
dtype: float64

In [23]:
# Per-observation outlier test: studentized residual, its unadjusted p-value,
# and the Bonferroni-adjusted p-value, evaluated at the 5% level.
(
    result
    .fit()
    .outlier_test(method='bonf', alpha=0.05)
)


Out[23]:
student_resid unadj_p bonf(p)
0 0.312689 0.754973 1.0
1 0.473016 0.636923 1.0
2 -0.240279 0.810458 1.0
3 -0.956277 0.340546 1.0
4 -0.178770 0.858371 1.0
5 0.036712 0.970765 1.0
6 -1.066895 0.287817 1.0
7 -0.125127 0.900599 1.0
8 -1.021792 0.308605 1.0
9 -0.068824 0.945226 1.0
10 0.703086 0.483145 1.0
11 -1.058507 0.291609 1.0
12 0.038433 0.969396 1.0
13 -0.792571 0.429341 1.0
14 2.431167 0.016288 1.0
15 0.778864 0.437347 1.0
16 1.142357 0.255215 1.0
17 0.416300 0.677815 1.0
18 1.089623 0.277712 1.0
19 -0.346136 0.729749 1.0
20 0.645498 0.519639 1.0
21 -0.078552 0.937498 1.0
22 -0.405720 0.685554 1.0
23 0.133053 0.894339 1.0
24 -1.904722 0.058824 1.0
25 0.255775 0.798492 1.0
26 -0.190906 0.848870 1.0
27 0.368833 0.712798 1.0
28 0.805113 0.422091 1.0
29 -1.063322 0.289428 1.0
... ... ... ...
120 0.540764 0.589512 1.0
121 -1.229878 0.220762 1.0
122 0.814912 0.416478 1.0
123 1.054229 0.293556 1.0
124 -0.491062 0.624135 1.0
125 0.191377 0.848501 1.0
126 0.833707 0.405837 1.0
127 -0.104393 0.917003 1.0
128 -0.388983 0.697868 1.0
129 0.878534 0.381128 1.0
130 1.353238 0.178115 1.0
131 0.653646 0.514390 1.0
132 -0.286276 0.775081 1.0
133 0.024680 0.980344 1.0
134 -1.876542 0.062619 1.0
135 2.474327 0.014518 1.0
136 -1.413807 0.159592 1.0
137 -0.924181 0.356949 1.0
138 -0.161519 0.871913 1.0
139 1.319446 0.189129 1.0
140 0.425193 0.671335 1.0
141 2.426542 0.016488 1.0
142 -1.048813 0.296034 1.0
143 -0.337894 0.735939 1.0
144 -0.077141 0.938619 1.0
145 1.606589 0.110351 1.0
146 1.215637 0.226126 1.0
147 0.602340 0.547902 1.0
148 -1.294716 0.197505 1.0
149 -1.321604 0.188411 1.0

150 rows × 3 columns


In [21]:
# Exploratory: list every attribute and method available on the fitted
# results object (e.g. params, summary, outlier_test).
# NOTE(review): scratch introspection -- consider removing from the final notebook.
dir(result.fit())


Out[21]:
['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cache',
 '_data_attr',
 '_get_robustcov_results',
 '_is_nested',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 'get_influence',
 'get_robustcov_results',
 'initialize',
 'k_constant',
 'llf',
 'load',
 'model',
 'mse_model',
 'mse_resid',
 'mse_total',
 'nobs',
 'normalized_cov_params',
 'outlier_test',
 'params',
 'predict',
 'pvalues',
 'remove_data',
 'resid',
 'resid_pearson',
 'rsquared',
 'rsquared_adj',
 'save',
 'scale',
 'ssr',
 'summary',
 'summary2',
 't_test',
 'tvalues',
 'uncentered_tss',
 'use_t',
 'wald_test',
 'wresid']

In [26]:
# Report observations whose Bonferroni-adjusted p-value for the studentized
# residual is below 0.05 (an empty frame means no outliers were flagged).
test = result.fit().outlier_test()
print('Bad data points (bonf(p) < 0.05):')
# Column position 2 holds bonf(p). `.iloc[:, 2]` replaces DataFrame.icol(2),
# which pandas deprecated (see the FutureWarning it emitted) and later removed.
print(test[test.iloc[:, 2] < 0.05])


Bad data points (bonf(p) < 0.05):
Empty DataFrame
Columns: [student_resid, unadj_p, bonf(p)]
Index: []
C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  app.launch_new_instance()

In [ ]: