In [14]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import pandas as pd
from tobit import *
In [15]:
rs = np.random.RandomState(seed=10)
ns = 100
nf = 10
x, y_orig, coef = make_regression(n_samples=ns, n_features=nf, coef=True, noise=0.0, random_state=rs)
x = pd.DataFrame(x)
y = pd.Series(y_orig)
In [16]:
n_quantiles = 3 # two-thirds of the data is truncated
quantile = 100/float(n_quantiles)
lower = np.percentile(y, quantile)
upper = np.percentile(y, (n_quantiles - 1) * quantile)
left = y < lower
right = y > upper
cens = pd.Series(np.zeros((ns,)))
cens[left] = -1
cens[right] = 1
y = y.clip(upper=upper, lower=lower)
hist = plt.hist(y)
In [17]:
tr = TobitModel()
result = tr.fit(x, y, cens, verbose=False)
In [18]:
fig, ax = plt.subplots()
ind = np.arange(len(coef))
width = 0.25
rects1 = ax.bar(ind, coef, width, color='g', label='True')
rects2 = ax.bar(ind + width, tr.coef_, width, color='r', label='Tobit')
rects3 = ax.bar(ind + (2 * width), tr.ols_coef_, width, color='b', label='OLS')
plt.ylabel("Coefficient")
plt.xlabel("Index of regressor")
plt.title("Tobit vs. OLS on censored data")
leg = plt.legend(loc=(0.22, 0.65))
Note that the truncation values do not have to be the same for e.g. all left-censored observations, or all right-censored observations, as in this example. However, the model does assume that the errors will be normally-distributed.
install.packages('censReg')
library(censReg)
install.packages('AER')
data('Affairs', package='AER')
write.table(Affairs, 'tobit_data.txt', quote=FALSE, row.names=FALSE)
estResult <- censReg( affairs ~ age + yearsmarried + religiousness +occupation + rating, data = Affairs)
summary(estResult)
In [19]:
data_file = 'tobit_data.txt'
df = pd.read_table(data_file, sep=' ')
df.loc[df.gender=='male', 'gender'] = 1
df.loc[df.gender=='female', 'gender'] = 0
df.loc[df.children=='yes', 'children'] = 1
df.loc[df.children=='no', 'children'] = 0
df = df.astype(float)
df.head()
Out[19]:
In [20]:
y = df.affairs
x = df.drop(['affairs', 'gender', 'education', 'children'], axis=1)
cens = pd.Series(np.zeros((len(y),)))
cens[y==0] = -1
cens.value_counts()
Out[20]:
In [24]:
tr = TobitModel()
tr = tr.fit(x, y, cens, verbose=False)
In [23]:
tr.coef_
Out[23]:
Note that the coefficients are identical to those obtained in the R analysis.