# -------------------------------------------------------------------
# MPO1-A Advanced Quantitative Research Methods
# Lecture 5: Endogeneity and IV

# Libraries: lmtest, sandwich, AER, systemfit
# Load course helper functions into the global environment (presumably
# including shccm() and hc0 used below -- they are not defined in this file).
# NOTE(review): sourced over plain http, i.e. unverified remote code is
# executed; confirm the host or keep a vetted local copy.
  source("http://klein.uk/R/myfunctions.R")
  ls()   # list what the sourced file put into the workspace
# -------------------------------------------------------------------



# --- Ex 2: IV regression, Sargan and Hausman test (2) -----------------------------
 # Mroz labour-supply data: 753 married women; wage/lwage observed only for
 # those who work. Use TRUE, not the reassignable shorthand T.
 mroz <- read.csv("http://klein.uk/R/Lent/mroz.csv", header=TRUE)
 str(mroz)

# --- Ex 2: a)  ---
# Graph a histogram for wage and lwage, and comment on the shape of the distribution.

 # Labour-force participation indicator: 1 = positive hours worked.
 table(as.numeric(mroz$hours > 0))
 # There are a total of 753 women in the sample of which 428 work.
 # Note that log-wages are missing for those women that don't work.

 # Draw the two histograms side by side.
 par(mfrow=c(1,2))
 hist(mroz$wage)
 # Wages are clearly skewed to the right: a long tail of high wages, with a
 # maximum of $25. Remember: the mean is $4.18.

 hist(mroz$lwage)
 # Log wages look much more symmetric. Wages are often lognormally distributed,
 # which means that log wages are normally distributed -- a typical pattern
 # for wage data.


# --- Ex 2: b)  ---
# For the remainder of this exercise, retain only the women that work in the data set.

 # Keep the 428 observations with positive hours; which() drops any rows where
 # the condition is NA, matching subset() semantics.
 mroz.w <- mroz[which(mroz$hours > 0), ]


# --- Ex 2: c)  ---
# Estimate the model: lwage ~ educ + exper + I(exper^2). How would you interpret 
# the coefficients? Does estimated values make sense? Which kind of problems may 
# these present?

 lm2c.ols <- lm(lwage ~ educ + exper + I(exper^2), data=mroz.w)
 shccm(lm2c.ols)
 # The OLS coefficient on educ suggests an 11% return. The marginal return to
 # an extra year of experience is 0.042 - 2*0.0008*exper, roughly 0.021 at the
 # mean of exper (=13.03), i.e. about a 2.1% increase in wages for an
 # additional year of experience at that level.

 # Omitted ability is probably biasing the results upwards. Ability is correlated
 # with education but is not in the regression (so it sits in the error term),
 # hence educ is correlated with the error term. In other words educ is endogenous.


# --- Ex 2: d)  ---
# Estimate the model by manual 2SLS (do not use ivreg yet), using fatheduc and 
# motheduc as instruments for educ. How do the results differ from those obtained 
# with educ?

 ## 1st stage regression: project the endogenous regressor on the instruments.
 lm2d.1 <- lm(educ ~ fatheduc + motheduc, data=mroz.w)
 #lm2d.1 <- lm(educ ~ fatheduc + motheduc + exper + I(exper^2), data=mroz.w)
 # fatheduc and motheduc both explain educ, the reduced form coefficients are significant
 ## 2nd stage: the first-stage fitted values act as an instrumental variable
 ## for educ. Use the fitted() accessor -- lm2d.1$fitted relies on partial
 ## matching of $fitted.values, which is fragile style.
 lm2d.2 <- lm(lwage ~ fitted(lm2d.1) + exper + I(exper^2), data=mroz.w)
 # There is a reduction in the return to education to 6%. This was expected if the
 # effect was to collect the bias created by the omitted variable ability, as the 
 # bias with the OLS regression should be positive. Of course, other IVs could
 # have delivered different estimations of the rate of return. If this were the
 # case other explanations should be presented. These could be, for instance, that 
 # IV captures some error in measurement or others.

 library(lmtest)
 library(sandwich)
 coeftest(lm2d.2, vcov=hc0)
 # compare to 2SLS to OLS:
 coeftest(lm2c.ols,vcov=hc0)


# --- Ex 2: e)  ---
# Do fatheduc and motheduc explain educ? Are these good instruments?

 # Inspect the first-stage regression (educ on fatheduc + motheduc) from d).
 # Instrument relevance requires the instruments to be jointly significant here.
 # shccm() comes from the sourced myfunctions.R -- presumably a summary with
 # heteroskedasticity-consistent standard errors; not visible in this file.
 shccm(lm2d.1)


# --- Ex 2: f)  ---
# A problem with 2SLS is that the standard errors in the second stage are wrong. 
# The correct standard errors are provided by the ivreg command in R. 
# Comment on the differences.

 library(AER)
 # ivreg formula: structural equation | full instrument list (the exogenous
 # regressors exper and I(exper^2) instrument themselves).
 lm2f.iv <- ivreg(lwage ~ educ + exper + I(exper^2) |
                    fatheduc + motheduc + exper + I(exper^2),
                  data=mroz.w)
 coeftest(lm2f.iv, vcov=hc0)    # errors are different for ivreg and manual 2SLS!
 # compare to manual 2SLS estimates and OLS:
 coeftest(lm2d.2, vcov=hc0)
 coeftest(lm2c.ols,vcov=hc0)
 # There are only very minor changes in the standard errors.
 # Note that the se for educ is quite a bit larger than the OLS one.


# --- Ex 2: g)  ---
# Perform the test for overidentifying restrictions, i.e.: whether the instruments 
# are uncorrelated with u.

 # Sargan test from Stock and Watson (2007)
 #lm2g.or <- lm(lm2f.iv$res ~ exper + I(exper^2) + fatheduc + motheduc, data=mroz.w)
 #lm2g.or.test <- linearHypothesis(lm2g.or, c("fatheduc=0", "motheduc=0"), test="Chisq", vcov=hc0)
 ## WARNING: df (and hence p-value) invalid above.
 ## correct df: # instruments - # endogenous variables
 #pchisq(q=lm2g.or.test[2,3], df = lm2f.iv$df.res - lm2g.or$df.res, lower.tail=F)

 ## Sargan LM test from handout, Lab Session 3
 ## Step 2. Auxiliary regression: regress the IV residuals on all exogenous
 ## variables (included regressors and excluded instruments).
 # residuals() avoids the fragile partial match lm2f.iv$res -> $residuals.
 lm.aux <- lm(residuals(lm2f.iv) ~ exper + I(exper^2) + fatheduc + motheduc, data=mroz.w)
 R2 <- summary(lm.aux)$r.squared
 # LM statistic: n * R^2 of the auxiliary regression.
 LM <- length(residuals(lm.aux)) * R2
 # LM is chi.sqr_{m-k}: m = no. instruments (incl. constant) = 5;
 # k = no. regressors in the structural equation (incl. constant) = 4.
 pchisq(q = LM, df = 5-4, lower.tail=FALSE)

 # The p-values are 0.5088 and 0.5386 respectively. We don't reject that the 
 # instruments are valid in that they are not correlated with the errors.


# --- Ex 2: h)  ---
# Test for endogeneity of educ.

 ## Durbin-Wu-Hausman test from handout, Lab Session 3
 ## Step 1. OLS residuals from the structural equation (estimated in c).
 # residuals() avoids the fragile partial match $res -> $residuals.
 e <- residuals(lm2c.ols)
 ## Step 2. Regress every endogenous regressor (in this case only educ) 
 ## on z (constant, fatheduc, motheduc, exper, exper^2) and obtain residuals.
 lm2h.1 <- lm(educ ~ fatheduc + motheduc + exper + I(exper^2), data=mroz.w)
 v <- residuals(lm2h.1)
 ## Step 3. Regress the OLS residuals on the structural regressors plus v.
 # I(exper^2) keeps the specification consistent with the rest of the file
 # (the data column expersq is exper squared, so the fit is identical).
 dwh3 <- lm(e ~ educ + exper + I(exper^2) + v, data=mroz.w); shccm(dwh3)
 ## Lagrange Multiplier Statistic: n * R^2 of the Step-3 regression.
 LM <- length(residuals(dwh3)) * summary(dwh3)$r.squared
 # Under H0 (all regressors exogenous) LM is chi^2 with df equal to the
 # number of potentially endogenous regressors tested: here 1 (educ only).
 pchisq(q = LM, df = 1, lower.tail = FALSE)

 ## Or: regression-based variant -- add the first-stage residuals to the
 ## structural equation and test their significance.
 #lm2h.1 <- lm(educ ~ fatheduc + motheduc + exper + I(exper^2), data=mroz.w)
 #lm2h.2 <- lm(lwage ~ educ + exper + I(exper^2) + lm2h.1$res, data=mroz.w)
 #coeftest(lm2h.2, vcov=hc0)
 # The coefficient on lm2h.1$res is not significant at 10%, and so endogeneity 
 # is debatable.

 ## Hausman test for exogeneity of regressors:
 ## Contrast test H = (b_IV - b_OLS)' [V_IV - V_OLS]^{-1} (b_IV - b_OLS),
 ## asymptotically chi^2 under H0 (all regressors exogenous) with df taken
 ## here as the number of compared coefficients. Returns the p-value.
 ##
 ## model.iv  : fitted IV model (e.g. from ivreg), consistent under H0 and H1
 ## model.ols : fitted OLS model, efficient under H0
 ##
 ## NOTE(review): with HC0 covariances the difference V_IV - V_OLS need not be
 ## positive semi-definite in finite samples; a negative statistic signals this.
 dwh.test <- function(model.iv, model.ols){
         cf_diff <- coef(model.iv) - coef(model.ols)
         vc_diff <- vcovHC(model.iv, "HC0") - vcovHC(model.ols, "HC0")
         x2_diff <- as.vector(t(cf_diff) %*% solve(vc_diff) %*% cf_diff)
         # nrow() is clearer than dim(...)[1]; spell out FALSE, not the
         # reassignable shorthand F.
         pchisq(q = x2_diff, df = nrow(vc_diff), lower.tail = FALSE)
 }
 # execute dwh.test-function:
 dwh.test(lm2f.iv, lm2c.ols)

 # As constructed, neither the DWH test (p-value 0.09) nor the Hausman test
 # (p-value 0.16) rejects the hypothesis of exogeneity at the 5% level. Note,
 # however, that exper and expersq could also be potentially endogenous; if we
 # treated them as such, additional instruments would have to be added.