# -------------------------------------------------------------------
# Lecture 6: Multicollinearity, F-tests for goodness of fit, Non-linearity and Model Transformations, Dummy variables 

# Required libraries: AER, car
  rm(list=ls())
  source("http://klein.uk/R/myfunctions.R")
  ls()
# -------------------------------------------------------------------




# --- California school districts once again... ---

 data("CASchools", package = "AER")
 CASchools$stratio = with(CASchools, students/teachers)
 CASchools$score = with(CASchools, (math + read)/2)
 attach(CASchools)

 par(mfrow = c(2, 2))
 plot(score ~ income, main = "district average income")

## Part A: Linear model
 lm1 = lm(score ~ income)
 shccm(lm1)
 abline(lm1, col = "blue", lwd=2)
 plot(lm1, which = 1)  # Diagnostic residual plot


## Part B: Quadratic model
 income2 = income * income
 lm2 = lm(score ~ income + income2)
 shccm(lm2)

 or = order(income)   # calculates a permutation vector. 
 plot(score ~ income, main = "district average income")
 abline(lm1, col = "blue", lwd = 2)
 lines(lm2$fitted[or] ~ income[or], col = "red", lwd = 2)
 legend("bottomright", c("linear", "quadratic"), lwd = 2,
  col = c("blue", "red"))
 plot(lm2, which = 1)   # Dignostic residual plot

# Are income and income2 jointly significant?
 library(car)
 lht(lm2, c("income","income2"))

# Marginal effect of a change of income at income level 50k?
 lm2$coef["income"] + 2 * 50 * lm2$coef["income2"]
# Is marginal effect at income level 50k significantly different from 0?
 lht(lm2, "income + 100*income2")




# --- Polynomials ---

 par(mfrow=c(1,1))
 plot(score ~ income, main = "district average income")
 abline(lm1, col = "blue", lwd=2)

# Sequential hypothesis test in the case of polynomial models:

 lm2 = lm(score ~ income + income2)
 lines(income[or], fitted(lm2)[or], col = "red", lwd = 2)
 shccm(lm2)

 income3 = income * income * income
 lm3 = lm(score ~ income + income2 + income3)
 lines(income[or], fitted(lm3)[or], col = "green", lwd = 2)
 shccm(lm3)

 lmp = lm(score ~ poly(income, 10))
 smooth = list(income = seq(5, 70, 0.5))
 lines(smooth$income, predict(lmp, newdata = smooth),
  col = "magenta", lwd = 2)
 legend("bottomright", c("linear", "quadratic", "cubic",
  "10th-deg"), lwd = 2, col = c("blue", "red", "green", "magenta"))




# --- Digression: Rescale variables to have mean=0 and sd=1 in polynomial models ---
 z = (income - mean(income) / sd(income))
 lm4 = lm(score ~ z + I(z^2) + I(z^3))    # equivalent to model lm3, except for using z-scores
 shccm(lm4)            # lower sd and larger t-statistics of the coefs than model lm3!
 vif(lm3); vif(lm4)    # lower vif statistics than model lm3




# --- Logarithms (see handout) ---

 plot(score ~ income, main = "district average income")
 abline(lm1, col = "blue", lwd=2)
 lines(income[or], fitted(lm2)[or], col = "red", lwd = 2)
 legend("bottomright", c("linear", "quadratic", "lin-log",
 "log-lin", "log-log"), lwd = 3, col = c("blue", "red",
 "green", "black", "orange"))

# linear-log model
# if X_i changes by 1%, Yi changes by 0.01 * beta_1
 lmLiLo = lm(score ~ log(income)); lmLiLo
 lines(income[or], fitted(lmLiLo)[or], col = "green", lwd = 3)
# marginal effect at median income-level: 
 lmLiLo$coef[2]/median(income)

# log-linear model
# a change of X_i by one unit translates into a change of Y_i by beta_1 * 100%
 lmLoLi = lm(log(score) ~ income); lmLoLi
 lines(income[or], exp(fitted(lmLoLi))[or], col = "black", lwd = 3)

# log-log model
# beta_1 is the elasticity of Y_i with respect to X_i .
 lmLoLo = lm(log(score) ~ log(income)); lmLoLo
 lines(income[or], exp(fitted(lmLoLo))[or], col = "orange", lwd = 3)




# --- Dummy variables and Interactions ---

# Data on the demand for refrigerators
 fridge = read.csv("http://klein.uk/R/fridge2.csv")
 str(fridge)
 attach(fridge)

# Data summaries and linear regression
 summary(fridge[, c("ecost", "price")])
 plot(price ~ ecost, main="Energy cost and refrigerator price")
 lm1 = lm(price ~ ecost); summary(lm1)
 abline(lm1, col="blue", lwd=2)   # fitted line plot

# OVB problem
 scatterplotMatrix(fridge[, 1:3])
 cor(fridge[, 1:3])


# --- Digression: Graphical Representation of Correlation Matrix (PLOTCORR) ---

 # install.packages("ellipse")
 library(ellipse)
 # corC =  cor(data.frame(price, volume, ecost, top, side, bottom))
 corC =  cor(fridge[,1:3])
 colors = c("#A50F15","#DE2D26","#FB6A4A","#FCAE91","#FEE5D9","white",
            "#EFF3FF","#BDD7E7","#6BAED6","#3182BD","#08519C")
 plotcorr(corC, col=colors[5*corC + 6], type = "lower")

# --- End of Digression ---


# Mitigating OVB. Why can we not expect the following to be a good model?
 lm2 = lm(price ~ volume + ecost); summary(lm2)
 library(car)
 vif(lm2)

# What else? Freezer position can matter for price: dummy variables!
# What does the data look like?
 freezer = as.factor(freezer)
 levels(freezer) = c("bottom","top","side")

 # lm(price ~ volume + ecost + top + side + bottom)
 lm3 = lm(price ~ volume + ecost + freezer); summary(lm3)

# bottom effects (and others, e.g., E[u]!=0) are captured in the intercept
 lm3$coef["(Intercept)"]

# top intercept
 lm3$coef["(Intercept)"] + lm3$coef["freezertop"]

# side intercept
 lm3$coef["(Intercept)"] + lm3$coef["freezerside"]

# both differences (from bottom) are signif.
 lht(lm3, "freezerside"); lht(lm3, "freezertop")

# Is the effect of "top" different from "side"? 
# -> change the reference category to "side"
 freezer = relevel(freezer, ref="side")
 lm4 = lm(price ~ volume + ecost + freezer); summary(lm4)

# Is it reasonable to assume the slope coefficients are the same for all types 
# of fridges? See slope dummy variables

 # 3 separate models for top, bottom and side
 lm.top = lm(price ~ volume + ecost, data=fridge[freezer=="top",]); summary(lm.top)
 lm.bottom = lm(price ~ volume + ecost, data=fridge[freezer=="bottom",]); summary(lm.bottom)
 lm.side = lm(price ~ volume + ecost, data=fridge[freezer=="side",]); summary(lm.side)

 # slope dummies / interactions
 lm6 = lm(price ~ volume + ecost + freezer + volume:freezer + ecost:freezer); summary(lm6)
 vif(lm6)

# Do the slope coefs differ for top, side or bottom?
 lht(lm6, c("volume:freezerbottom","volume:freezertop","ecost:freezerbottom","ecost:freezertop"))

 levels(freezer)[2:3] = "topbottom"
 lm6 = lm(price ~ volume + ecost + freezer + volume:freezer + ecost:freezer); summary(lm6)
 lht(lm6, c("volume:freezertopbottom", "ecost:freezertopbottom"))




# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------