In [14]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

from tobit import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Recovers true coefficients on artificial censored regression data


In [15]:
# Synthetic, noise-free linear regression problem whose true coefficients are
# returned alongside the data (coef=True), seeded for reproducibility.
rs = np.random.RandomState(seed=10)
ns, nf = 100, 10  # number of samples, number of features
x, y_orig, coef = make_regression(
    n_samples=ns,
    n_features=nf,
    coef=True,
    noise=0.0,
    random_state=rs,
)
# Wrap the arrays as pandas objects; `y_orig` keeps the uncensored responses.
x, y = pd.DataFrame(x), pd.Series(y_orig)

In [16]:
# Censor (rather than truncate) the responses: every observation is kept, but
# values below the lower tercile or above the upper tercile are replaced by the
# corresponding threshold, so roughly two-thirds of the sample is censored.
n_quantiles = 3
quantile = 100 / float(n_quantiles)

# Always start from the uncensored responses. Re-running this cell would
# otherwise recompute the thresholds and masks from already-clipped values,
# where censored observations sit exactly on a threshold and the strict
# comparisons below are no longer guaranteed to flag them.
y_uncensored = pd.Series(y_orig)
lower = np.percentile(y_uncensored, quantile)
upper = np.percentile(y_uncensored, (n_quantiles - 1) * quantile)
left = y_uncensored < lower
right = y_uncensored > upper

# Censoring indicator as assigned here: -1 for rows below the lower threshold,
# 1 for rows above the upper threshold, 0 for everything in between.
cens = pd.Series(np.zeros((ns,)))
cens[left] = -1
cens[right] = 1

# Clip the responses to the thresholds and show the resulting distribution.
y = y_uncensored.clip(upper=upper, lower=lower)
hist = plt.hist(y)



In [17]:
# Fit the Tobit model to the censored synthetic data, passing the censoring
# indicators alongside the regressors and the clipped responses.
tr = TobitModel()
result = tr.fit(
    x,
    y,
    cens,
    verbose=False,
)

In [18]:
# Grouped bar chart: for each regressor, the true coefficient next to the
# Tobit estimate and the OLS estimate stored on the fitted model.
fig, ax = plt.subplots()
ind = np.arange(len(coef))
width = 0.25  # three bars per regressor, each a quarter of a unit wide

rects1 = ax.bar(ind, coef, width=width, color='g', label='True')
rects2 = ax.bar(ind + width, tr.coef_, width=width, color='r', label='Tobit')
rects3 = ax.bar(ind + 2 * width, tr.ols_coef_, width=width, color='b', label='OLS')

ax.set_ylabel("Coefficient")
ax.set_xlabel("Index of regressor")
ax.set_title("Tobit vs. OLS on censored data")
leg = ax.legend(loc=(0.22, 0.65))


Note that the censoring thresholds do not have to be the same for all left-censored observations, or for all right-censored observations, as they are in this example. However, the model does assume that the errors are normally distributed.

Comparison to R censReg package result on AER data

Commands in R for Tobit analysis of Affairs data:

install.packages('censReg')
library(censReg)
install.packages('AER')
data('Affairs', package='AER')
write.table(Affairs, 'tobit_data.txt', quote=FALSE, row.names=FALSE)
estResult <- censReg( affairs ~ age + yearsmarried + religiousness +occupation + rating, data = Affairs)
summary(estResult)

Python analysis of same data


In [19]:
# Load the space-separated Affairs table written by the R `write.table` call
# shown above.
data_file = 'tobit_data.txt'
affairs_raw = pd.read_table(data_file, sep=' ')

# Encode the two text-valued columns as 0/1. `.map` builds new numeric columns
# instead of writing integers into the text columns in place, which only works
# when pandas stores those columns with a generic object dtype.
binary_cols = ['gender', 'children']
df = affairs_raw.assign(
    gender=affairs_raw['gender'].map({'male': 1, 'female': 0}),
    children=affairs_raw['children'].map({'yes': 1, 'no': 0}),
)

# `.map` turns a label missing from the mapping into NaN. Keep failing loudly on
# such a label (the float conversion used to reject it) rather than fitting on
# silently introduced missing values.
if not df[binary_cols].isna().equals(affairs_raw[binary_cols].isna()):
    raise ValueError("unexpected label in 'gender' or 'children' column")

# The model code is handed purely numeric data.
df = df.astype(float)
df.head()


Out[19]:
affairs gender age yearsmarried children religiousness education occupation rating
0 0 1 37 10.00 0 3 18 7 4
1 0 0 27 4.00 0 4 14 6 4
2 0 0 32 15.00 1 1 12 1 4
3 0 1 57 15.00 1 5 18 6 5
4 0 1 22 0.75 0 2 17 6 3

In [20]:
# Response is the `affairs` column; the regressors are whatever remains after
# dropping the response and the three covariates not used in the R formula
# (gender, education, children).
y = df['affairs']
x = df.drop(['affairs', 'gender', 'education', 'children'], axis=1)

# Flag every zero response as left-censored (-1); all other rows stay at 0.
cens = pd.Series(np.where(y == 0, -1.0, 0.0))
cens.value_counts()


Out[20]:
-1    451
 0    150
dtype: int64

In [24]:
# Fit a fresh Tobit model on the Affairs data; `tr` ends up bound to whatever
# `fit` returns, which is the object the coefficients are read from below.
tr = TobitModel().fit(x, y, cens, verbose=False)

In [23]:
# Display the fitted Tobit coefficients for the Affairs regressors so they can
# be compared with the estimates from the R censReg run.
# NOTE(review): presumably ordered like the columns of `x` — confirm in tobit.
tr.coef_


Out[23]:
array([-0.17933256,  0.55414179, -1.68622027,  0.32605329, -2.2849727 ])

Note that the coefficients are identical to those obtained in the R analysis.