# Lecture 7: Tobit model


## 1. Motivation
# Censored variables
# We call a variable “censored” when we can only observe the latent variable in a
# certain range.
#     • hours worked (negative amounts can not be observed)
#     • income (only when relevant for social insurance)
#     • bids in an auction (often only the second highest bid is observed)


## 2. Example
# Let us generate a simple censored variable:
  library(survival)
  # Simulate a latent outcome y = x + u with Gaussian noise, then censor
  # the observed version to the unit interval [0, 1].
  set.seed(123)
  n <- 120
  sigma <- 2
  x <- sort(runif(n))
  y <- x + sigma * rnorm(n)
  # Observed outcome: latent y clipped at 0 from below and at 1 from above.
  ycens <- pmin(pmax(y, 0), 1)

# plot(y ~ x, col="grey")
# points(ycens[y <= 0] ~ x[y <= 0], col="red")
# points(ycens[y >= 1] ~ x[y >= 1], col="red")
# points(ycens[y >= 0 & y <= 1] ~ x[y >= 0 & y <= 1])
# abline(a=0, b=1)

# As with the logit model we can try to use simple OLS to estimate the
# relationship:
  olsall = lm(ycens ~ x); olsall

# This result is not too convincing, perhaps we should only look at the uncensored
# observations:
  olsrel = lm(ycens ~ x, subset = (ycens > 0 & ycens < 1)); olsrel

# The left part of the following picture shows the true relation as well as the two
# estimates:
# First figure: three panels comparing the true relation, the two OLS
# fits, and the residual patterns they imply.
par(mfrow = c(1, 3))
# Panel 1: censored data, true line, and both OLS fits.
plot(ycens ~ x)
abline(a = 0, b = 1, lwd = 3, col = "blue")
abline(olsall, lty = 2, col = 2, lwd = 3)
abline(olsrel, lty = 3, col = "red", lwd = 3)
legend("topleft", c("true", "OLS", "OLS not censored"),
    lty = 1:3, col = c("blue", "red", "red"), bg = "white")
# Panel 2: "true" residuals ycens - E(y) = ycens - x, with the
# fitted-minus-true deviation of the full-sample OLS line overlaid.
plot(ycens - x ~ x, main = "true residuals", ylab = "ycens-E(y)")
abline(h = 0)
lines(predict(olsall) - x ~ x, lty = 2, col = 2, lwd = 3)
legend("topright", c("true", "OLS"), col = 1:2, lty = 1:2)
# Panel 3: residuals-vs-fitted diagnostic from the OLS fit (which = 1).
plot(olsall, which = 1, main = "estimated residuals")

# The graph in the middle shows the true residuals, the graph on the right shows
# the estimated residuals. While the estimated residuals underestimate the
# problem, it is still visible: E(u|X) != 0 for most X.


## 3. Solution
# Interval regression
# As in the logistic case we use maximum likelihood. The procedure is called
# “interval regression” since the dependent variable is an interval.
#    • [y, y] — known observation
#    • [y, ∞] — observation is larger than y
#    • [−∞, y] — observation is smaller than y
#    • [y1, y2] — observation is between y1 and y2

# In R we use missings (NA) to indicate ∞.
  # Encode each observation as an interval [ymin, ymax]; NA marks an
  # unbounded end (left-censored at 0, right-censored at 1).
  ymin <- replace(ycens, ycens == 0, NA)
  ymax <- replace(ycens, ycens == 1, NA)
  # Gaussian interval regression by maximum likelihood; "interval2"
  # coding treats NA bounds as infinite.
  intreg <- survreg(Surv(ymin, ymax, type = "interval2") ~ x,
      dist = "gaussian")
  intreg

  # Alternative: VGAM's tobit() fits the same two-limit Tobit model.
  library(VGAM)
  ?tobit
  intreg2 <- vglm(ycens ~ x, tobit(Lower=0, Upper=1))
  intreg2
  coef(intreg2, matrix=TRUE)


# The estimated β is not perfect, but much better than the naïve OLS estimates
# above. Let us show the estimated regression line in a graph:

# Second figure: two panels comparing all estimates, now including the
# interval-regression fit.
par(mfrow = c(1, 2))
# Panel 1: data with the true line and all three fitted lines.
plot(ycens ~ x)
abline(a = 0, b = 1, lwd = 3, col = "blue")
# BUG FIX: the original call was abline(olsall, olsrel, ...), which passes
# the second model object as abline's slope argument `b` and is invalid.
# Each fitted model gets its own abline() call, matching the legend.
abline(olsall, lty = 2, col = "red", lwd = 3)
abline(olsrel, lty = 3, col = "red", lwd = 3)
abline(intreg, lty = 4, col = "green", lwd = 3)
legend("topleft", c("true", "OLS", "OLS not censored",
    "interval"), lty = 1:4, col = c("blue", "red", "red",
    "green"), bg = "white")
# Panel 2: true residuals ycens - E(y) = ycens - x, with the deviations
# of the OLS and interval-regression fits from the true line overlaid.
plot(ycens - x ~ x, main = "true residuals", ylab = "ycens-E(y)")
abline(h = 0)
lines(predict(olsall) - x ~ x, lty = 2, col = 2, lwd = 3)
lines(predict(intreg) - x ~ x, lty = 3, col = 3, lwd = 3)
legend("topright", c("true", "OLS", "interval"), col = 1:3,
    lty = 1:3)

# The graph on the right side of the figure shows again the true residuals, i.e.
# y − E(y). We should note two things:
#   • The OLS estimate typically underestimates the relationship. Reason: The
#     extreme values are missing (censored)
#   • The Interval regression can overestimate. It is not necessarily very stable.