In [14]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline

from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tobit import *









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Recovers true coefficients on artificial censored regression data



In [15]:

    
# Build a noise-free synthetic linear regression problem so that the true
# coefficients (`coef`) are known and can be compared with the fitted ones.
ns = 100  # number of samples
nf = 10   # number of features
rs = np.random.RandomState(seed=10)  # fixed seed for reproducibility
x, y_orig, coef = make_regression(
    n_samples=ns,
    n_features=nf,
    noise=0.0,
    coef=True,
    random_state=rs,
)
# Wrap the raw arrays in pandas containers; `y_orig` keeps the uncensored response.
x, y = pd.DataFrame(x), pd.Series(y_orig)



In [16]:

    
# Censor the synthetic response at its lower and upper terciles and record,
# per observation, which side (if any) was censored.
n_quantiles = 3  # two-thirds of the data is censored (one third in each tail)
quantile = 100 / float(n_quantiles)

# Always start from the uncensored response. Deriving the thresholds from `y`
# (which this cell overwrites below) would make a re-run compute percentiles of
# already-clipped values, so no observation would be flagged as censored.
y_uncensored = pd.Series(y_orig)
lower = np.percentile(y_uncensored, quantile)
upper = np.percentile(y_uncensored, (n_quantiles - 1) * quantile)
left = y_uncensored < lower    # observations below the lower threshold
right = y_uncensored > upper   # observations above the upper threshold

# Censoring indicator: -1 = left-censored, 0 = uncensored, 1 = right-censored.
cens = pd.Series(np.zeros((ns,)))
cens[left] = -1
cens[right] = 1

# Observed response: values outside [lower, upper] are replaced by the threshold.
y = y_uncensored.clip(upper=upper, lower=lower)
hist = plt.hist(y)



In [17]:

    
# Fit the Tobit model to the censored synthetic response.
# `cens` is the per-observation censoring flag built above:
# -1 = left-censored, 0 = uncensored, 1 = right-censored.
tr = TobitModel()
result = tr.fit(x, y, cens, verbose=False)



In [18]:

    
# Grouped bar chart: for each regressor, show the true coefficient next to the
# Tobit and OLS estimates obtained from the censored data.
fig, ax = plt.subplots()
ind = np.arange(len(coef))  # one group of bars per regressor
width = 0.25                # width of a single bar within a group

rects1 = ax.bar(ind, coef, width, color='g', label='True')
rects2 = ax.bar(ind + width, tr.coef_, width, color='r', label='Tobit')
rects3 = ax.bar(ind + 2 * width, tr.ols_coef_, width, color='b', label='OLS')

# Label through the Axes object created above rather than the pyplot state machine.
ax.set_ylabel("Coefficient")
ax.set_xlabel("Index of regressor")
ax.set_title("Tobit vs. OLS on censored data")
leg = ax.legend(loc=(0.22, 0.65))

Note that the censoring values do not have to be the same for all left-censored observations, or for all right-censored observations, as they are in this example. However, the model does assume that the errors are normally distributed.

Comparison to R censReg package result on AER data

Commands in R for Tobit analysis of Affairs data:

install.packages('censReg')
library(censReg)
install.packages('AER')
data('Affairs', package='AER')
write.table(Affairs, 'tobit_data.txt', quote=FALSE, row.names=FALSE)
estResult <- censReg( affairs ~ age + yearsmarried + religiousness +occupation + rating, data = Affairs)
summary(estResult)

Python analysis of same data



In [19]:

    
# Load the Affairs data exported from R (space-separated, with a header row)
# and recode the two categorical columns as 0/1 so the frame can be cast to float.
data_file = 'tobit_data.txt'
df = pd.read_table(data_file, sep=' ')

# Explicit label -> code tables. Mapping whole columns avoids writing integers
# into text columns through `.loc`, which relies on the column being `object`
# dtype and can be rejected when pandas stores text in a dedicated string dtype.
binary_codes = {
    'gender': {'male': 1, 'female': 0},
    'children': {'yes': 1, 'no': 0},
}
for column, codes in binary_codes.items():
    # Fail loudly on labels we do not know how to encode instead of silently
    # turning them into NaN (missing values are left untouched).
    unexpected = set(df[column].dropna().unique()) - set(codes)
    if unexpected:
        raise ValueError(
            "Unexpected values in column %r: %r" % (column, sorted(unexpected))
        )
    df[column] = df[column].map(codes)

df = df.astype(float)
df.head()









    Out[19]:






  
    
      
      affairs
      gender
      age
      yearsmarried
      children
      religiousness
      education
      occupation
      rating
    
  
  
    
      0
       0
       1
       37
       10.00
       0
       3
       18
       7
       4
    
    
      1
       0
       0
       27
        4.00
       0
       4
       14
       6
       4
    
    
      2
       0
       0
       32
       15.00
       1
       1
       12
       1
       4
    
    
      3
       0
       1
       57
       15.00
       1
       5
       18
       6
       5
    
    
      4
       0
       1
       22
        0.75
       0
       2
       17
       6
       3



In [20]:

    
# Response is the `affairs` column; the regressors are what remains after
# dropping the response and the columns not used in the R model
# (age, yearsmarried, religiousness, occupation, rating).
y = df['affairs']
x = df.drop(columns=['affairs', 'gender', 'education', 'children'])

# Censoring indicator: observations with zero affairs are flagged as
# left-censored (-1); all others are uncensored (0).
cens = pd.Series(np.zeros(len(y)))
cens[y == 0] = -1
cens.value_counts()









    Out[20]:





-1    451
 0    150
dtype: int64



In [24]:

    
# Fit a fresh Tobit model on the Affairs data; here `cens` is -1 where
# affairs == 0 (left-censored) and 0 otherwise.
tr = TobitModel()
# NOTE(review): `tr` is rebound to the return value of fit(); presumably fit()
# returns the fitted model itself, since `tr.coef_` is read afterwards — confirm.
tr = tr.fit(x, y, cens, verbose=False)



In [23]:

    
# Fitted Tobit coefficients — presumably one per column of `x`
# (age, yearsmarried, religiousness, occupation, rating), in that order; TODO confirm.
tr.coef_









    Out[23]:





array([-0.17933256,  0.55414179, -1.68622027,  0.32605329, -2.2849727 ])

Note that the coefficients (for age, yearsmarried, religiousness, occupation and rating, respectively) are identical to those obtained in the R analysis.

	gender	age	yearsmarried	children	religiousness	education	occupation	rating
0	1	37	10.00	0	3	18	7	4
1	0	27	4.00	0	4	14	6	4
2	0	32	15.00	1	1	12	1	4
3	1	57	15.00	1	5	18	6	5
4	1	22	0.75	0	2	17	6	3