```# -------------------------------------------------------------------
# MPO1 Quantitative Research Methods
# Thilo Klein
# Lab Session 3: Model Selection; Inference; Dummy Variables

# Required libraries: car, lmtest, zoo, sandwich
source("http://klein.uk/R/myfunctions.R")
ls()
# -------------------------------------------------------------------

# --- Digression: Heteroskedasticity Consistent Covariance Matrix (HCCM) ---
set.seed(123) # this way we will all have the same "random" variables
x <- runif(1000)
e <- rnorm(1000)
par(mfrow=c(1,2))

y.hom <- x + e # homoscedasticity
lm.hom <- lm(y.hom ~ x)
plot(y.hom ~ x, ylim=c(-2,3)); abline(lm.hom, col="red", lwd=2)

y.het <- x + e*x       # heteroscedasticity
lm.het <- lm(y.het ~ x)
plot(y.het ~ x, ylim=c(-2,3)); abline(lm.het, col="red", lwd=2)

vcov(lm.het)
??hccm
library(car)
hccm(lm.het, type="hc0")       # White-adjusted errors are R's default setting
hccm(lm.het, type="hc3")       # heteroscedasticity consistent errors used by Stata

# --- [load shccm function:] ---
source("http://klein.uk/R/myfunctions.R")

summary(lm.hom)                # similar results under homoscedasticity
shccm(lm.hom, "hc0")

summary(lm.het)                # different results under heteroscedasticity
shccm(lm.het)

rm(x, e, y.hom, y.het, lm.hom, lm.het)

# --- Ex 1: Multicollinearity; Omitted Variable Bias ------------------
set.seed(123)
epsilon <- rnorm(10000)
omega   <- rnorm(10000)
eta     <- rnorm(10000)
zeta    <- rnorm(10000)

x1 <- 5 + omega + 0.3* eta
x2 <- 10 + omega
x3 <- 5 + eta
y  <- 20 + x1 + x2 + epsilon
z  <- 30 + x2 + x3 + zeta

# --- Digression: visual test for normality ---
hist(qnorm(runif(10000)), freq=F)
grid.x <- seq(-4,4,.1)
grid.y <- dnorm(grid.x)
lines(grid.x, grid.y, col="blue", lwd=2)

# --- Ex 1: b) ---
cor(cbind(x1, x2, x3))
lm1b <- lm(y ~ x1 + x2)
shccm(lm1b)
vif(lm1b)              # VIF > 10, severe multicollinearity.

# --- Ex 1: c) ---
lm1c <- lm(z ~ x2 + x3)
shccm(lm1c)
vif(lm1c)

# --- Ex 1: d) ---
lm(y ~ x1)\$coef                # cov(x1,x2)=1 -> OVB for b2; Intercept biased
lm(y ~ x1 + x2)\$coef
lm(z ~ x2)\$coef                # true model: z=30+x2+x3 -> Intercept biased
lm(z ~ x2 + x3)\$coef

# --- Ex 2: Multicollinearity; Variance Inflation Factor ------------------
salary <- read.table("http://klein.uk/R/salary.txt", header=T, sep="	")
str(salary)
lm2 <- lm(LOGSAL ~ EDUC + LOGSALBEGIN + GENDER + MINORITY, data=salary)
shccm(lm2)
vif(lm2)

lm2ed <- lm(EDUC ~ LOGSALBEGIN + GENDER + MINORITY, data=salary)
print( paste( "The R2 is",  summary(lm2ed)\$r.squared ) )
print( paste( "variance inflation factor is",  1/(1- summary(lm2ed)\$r.squared) ) )

cor(subset(salary, select=c("EDUC","LOGSALBEGIN","GENDER","MINORITY")))
# Or:
attach(salary)
cor(cbind(EDUC, LOGSALBEGIN, GENDER, MINORITY))
detach(salary)

# --- Ex 3: The effects of having highly correlated regressors ------------------
eaef21 <- read.csv("http://klein.uk/R/eaef21", header=T)
str(eaef21)
lm3w <- lm( SIBLINGS ~ SM + SF, data = eaef21[eaef21\$ETHWHITE==1, ] )
lm3b <- lm( SIBLINGS ~ SM + SF, data = eaef21[eaef21\$ETHBLACK==1, ] )
lm3h <- lm( SIBLINGS ~ SM + SF, data = eaef21[eaef21\$ETHHISP==1, ] )
lm3w; lm3b; lm3h

library(lmtest); library(zoo); library(sandwich)
lm3w <- lm(SIBLINGS ~ SM + SF, data=eaef21[eaef21\$ETHWHITE==1, ]); coeftest(lm3w, vcov=hc0)
cor( cbind(eaef21\$SM, eaef21\$SF)[eaef21\$ETHWHITE==1, ] )
vif(lm3w)

lm3b <- lm( SIBLINGS ~ SM + SF, data=subset(eaef21, ETHBLACK ==1) ); coeftest(lm3b, vcov=hc0)
cor( subset(eaef21, ETHBLACK ==1, select=c(SM, SF)) )
vif(lm3b)

lm3h <- lm( SIBLINGS ~ SM + SF, data= subset(eaef21, ETHHISP==1) ); coeftest(lm3h, vcov=hc0)
cor( subset(eaef21, ETHHISP ==1, select=c(SM, SF)) )
vif(lm3h)

# --- Ex 3: b) ----
lm3.1 <- lm(SIBLINGS ~ SM + SF, data=eaef21); shccm(lm3.1)

linearHypothesis(model=lm3.1, "SM = SF", vcov=hc0)
eaef21\$SP <- eaef21\$SF + eaef21\$SM
lm3.2 <- lm(SIBLINGS ~ SP, data=eaef21); shccm(lm3.2)

# --- Ex 4: OLS assumptions; dummy variables (optional) ------------------
e <- rnorm(n=100, mean=1, sd=1)
x <- runif(100)
y <- 2*x + e
OLS <- lm(y ~ x); shccm(OLS)
par(mfrow=c(2,2))
plot(OLS)

# --- Ex 5: Bank wages ------------------
bank <- read.csv("http://klein.uk/R/bank", header=T)
str(bank)

bank\$logsalbegin <- log(bank\$salbegin)
lm5i <- lm(logsal ~ educ + logsalbegin + gender + minority, data=bank); shccm(lm5i)

# --- Ex 5: ii) ---

# 1="clerical"   2="custodial"  3="managerial"
bank\$jobcat.cler <- ifelse(bank\$jobcat==1, 1, 0)
bank\$jobcat.cust <- ifelse(bank\$jobcat==2, 1, 0)
bank\$jobcat.man <- ifelse(bank\$jobcat==3, 1, 0)
lm5iia <- lm(logsal ~ educ + male + minority + jobcat.cler + jobcat.cust, data=bank); shccm(lm5iia)

bank\$jobcat.fac <- factor(bank\$jobcat)
levels(bank\$jobcat.fac) <- c("Clerical", "Custodial", "Managerial")
lm5iib <- lm(logsal ~ educ + male + minority + jobcat.fac, data=bank); shccm(lm5iib)
bank\$jobcat.fac <- relevel(bank\$jobcat.fac, ref="Managerial")  # change reference category/level

linearHypothesis(model=lm5iia, "jobcat.cler = jobcat.cust", vcov=hc0)

# --- Digression: how to categorize a continuous variable ---
bank\$educ.cut <- cut(bank\$educ, breaks=3)                      # categorize variable in 3 categories/levels
bank\$educ.cut <- cut(bank\$educ, breaks=c(0,12,16,Inf)) # alternatively: define categories yourself

write.csv(bank, "bank_cat.csv")                                        # write data-frame to csv-file

# --- Ex 5: iii) ---
table(bank\$jobcat.fac)
lm(logsal ~ educ + male + minority, data=subset(bank, jobcat==2) )
lm(logsal ~ educ + male + minority, data=subset(bank, jobcat==3) )
sum(bank\$male[bank\$jobcat==2])

# --- Ex 5: iv) ---
bank\$female <- 1 - bank\$male
bank\$femaleandminority <- bank\$female * bank\$minority                          # either this
lm5iv <- lm(logsal ~ educ + male + minority + femaleandminority, data=bank)    # ..
lm(logsal ~ educ + male + minority + female:minority, data=bank)                       # or that.

# --- Ex 5: v) ---
linearHypothesis(model=lm5iv, "male = minority", vcov=hc0)

# --- Ex 5: vi) ---
lm5vi <- lm(logsal ~ educ + female + minority , data=bank); shccm(lm5vi)
linearHypothesis(lm5vi, "educ = 0.07", vcov=hc0)
linearHypothesis(lm5vi, c("educ = 0.07", "female = minority"), vcov=hc0)

# --- Ex 5: vii) ---
lm(logsal ~ educ + female + minority + logsalbegin + logsalbegin:jobcat.man, data=bank)

# --- Ex 6: NO2 pollution ------------------
no2poll <- read.csv("http://klein.uk/R/no2pollution", header=T)
str(no2poll)

lm6 <- lm(lno2 ~ lcars + temp + tchng23 + wndspd + wnddir + day, data=no2poll); shccm(lm6)
linearHypothesis(lm6, "wndspd = wnddir", vcov=hc0)

# --- Ex 7: Programming in R ------------------
data <- read.csv("http://klein.uk/R/dataset", header=T)
str(data)

# source("Rprogramming.R")

# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------

q("no")
```