# -------------------------------------------------------------------
# Master in Finance Econometrics Module
# Thilo Klein
# Lab Session 3: Generalized Least Squares

# Required libraries: VGAM, zoo, timeDate, tseries
# Load course helper functions (e.g. shccm, used throughout for
# heteroskedasticity-robust model summaries) -- requires internet access.
source("http://klein.uk/R/myfunctions.R")
ls()   # list workspace objects to see what myfunctions.R defined
# -------------------------------------------------------------------

# --- Ex 1: Heteroskedasticity (1) ----------------
# --- Ex 1: a) ---
# Use the data in hprice1.csv to obtain heteroskedasticity-robust standard errors
# and homoskedastic-only standard errors for equation: price ~ lotsize + sqrft + bdrms.
# Discuss any important difference with the usual homoskedasticity-only standard errors.

# Read the house-price data. Use TRUE, not the reassignable shorthand T
# (this script later assigns to T, which would silently break header=T).
house <- read.csv("http://klein.uk/R/hprice1", header=TRUE)
str(house)

lm1a <- lm(price ~ lotsize + sqrft + bdrms, data=house)
summary(lm1a)   # usual (homoskedasticity-only) standard errors
shccm(lm1a)     # heteroskedasticity-robust (White) standard errors

# --- Ex 1: b) ---
# Repeat part a) for log(price) ~ log(lotsize) + log(sqrft) + bdrms.
# The data set already contains the logged variables lprice, llotsize, lsqrft.

lm1b <- lm(lprice ~ llotsize + lsqrft + bdrms, data=house)
summary(lm1b)   # homoskedasticity-only standard errors
shccm(lm1b)     # robust standard errors for comparison

# --- Ex 1: c) ---
# What does this example suggest about heteroskedasticity and the transformation used
# for the dependent variable?

# refer to handout.

# --- Ex 1: d) ---
# Apply the full White-test for heteroskedasticity to part b). Which variables does it
# apply? Using the chi-squared form of the statistic, obtain the p-value. What do you
# conclude?

# Auxiliary regression: squared residuals on all regressors, their squares and
# all pairwise cross-products (the three-way interaction is removed).
house$lm1b.sqres <- residuals(lm1b)^2
lm1b.white.test <- lm(lm1b.sqres ~ llotsize*lsqrft*bdrms - llotsize:lsqrft:bdrms
                      + I(llotsize^2) + I(lsqrft^2) + I(bdrms^2), data=house)
shccm(lm1b.white.test)

# Test statistic n*R^2 ~ Chi^2(9) under H0 (9 slope regressors in the auxiliary
# regression). Do NOT name it T -- that shadows the logical shorthand TRUE.
white.stat <- summary(lm1b.white.test)$r.squared * nrow(house)
library(VGAM)   # listed as required for this lab; pchisq itself is base stats
pchisq(q=white.stat, df=9, lower.tail=FALSE)   # =0.39 -> little evidence against homoskedasticity assumption

# --- Ex 2: Heteroskedasticity (2) ----------------
training <- read.csv("http://klein.uk/R/training", header=TRUE)
str(training)

# --- Ex 2: a) ---
# Consider the simple regression model log(scrap)=b_1 + b_2 grant + u, where scrap is
# the firm scrap rate, and grant is a dummy variable indicating whether a firm received
# a job training grant. Can you think of some reasons why the unobserved factors in u
# might be correlated with grant?

lm2a <- lm(log(scrap) ~ grant, data=training); summary(lm2a)
par(mfrow=c(2,2))   # 2x2 panel for the four standard lm diagnostic plots
plot(lm2a)

# --- Ex 2: b) ---
# Estimate a simple regression model. Does receiving a job-training grant significantly
# lower a firm's scrap rate?

# Same specification as in a); the grant coefficient is the estimated effect.
lm2b <- lm(log(scrap) ~ grant, data=training)
summary(lm2b)

# --- Ex 2: c) ---
# Now, add as an a explanatory variable log(scrap_1) (this variable is the scrap rate
# of the previous year). How does this change the estimated effect of grant? Is it
# statistically significant at the 5% level?

# Controlling for last year's scrap rate changes the interpretation of grant.
lm2c <- lm(log(scrap) ~ grant + log(scrap_1), data=training)
summary(lm2c)
plot(lm2c)

# --- Ex 2: d) ---
# Test the null hypothesis that the parameter on log(scrap_1) is 1 against the
# two-sided alternative. Report the p-value for the test.

library(car)   # provides linearHypothesis(); not loaded anywhere above
linearHypothesis(model=lm2c, "log(scrap_1) = 1")

# --- Ex 2: e) ---
# Repeat parts c) and d) using heteroscedasticity-robust standard errors, and briefly
# discuss any notable differences.

# Robust (White) standard errors for the model from part c) ...
shccm(lm2c)
# ... and the robust version of the Wald test from part d).
linearHypothesis(model=lm2c, "log(scrap_1) = 1", white.adjust=TRUE)
# Alternative: HC3 small-sample adjustment.
# linearHypothesis(model=lm2c, "log(scrap_1) = 1", white.adjust="hc3")

# --- Ex 3: Autocorrelation ----------------
bond <- read.csv("http://klein.uk/R/bonds", header=TRUE)
str(bond)

# --- Ex 3: a) ---
# Regress changes in AAA bond returns (daaa) on US Treasury Bill interest rates (dus3mt).
# Plot the residuals. Are the residuals distributed evenly across time?

lm3a <- lm(daaa ~ dus3mt, data=bond); shccm(lm3a)

library(zoo)
# Parse the panel date strings into a yearmon index; format "%Ym%m"
# assumes values like "1990m01" -- TODO confirm against the raw data.
bond$paneldate <- as.yearmon(bond$paneldate, format="%Ym%m")
e <- residuals(lm3a)   # full accessor instead of partial matching lm3a$res
plot(e ~ bond$paneldate, type="l")

# --- Ex 3: b) ---
# Investigate serial autocorrelation in residuals. Use the Breusch-Godfrey Serial
# Correlation LM Test.

N <- length(e)
e1 <- c(NA, e[1:(N-1)])       # residuals lagged once
e2 <- c(NA, NA, e[1:(N-2)])   # residuals lagged twice
head(data.frame(bond$paneldate, e, e1, e2))
plot(e ~ e1)
# First-order autocorrelation of the residuals; compute it rather than
# hard-coding the value (was the literal 0.2761491).
r1 <- cor(e, e1, use="complete")
r1
abline(a=0, b=r1, col="red", lwd=2)

# --- Breusch-Godfrey Serial Correlation LM Test: ---
# 1. fit auxiliary regression of estimated error terms against all independent variables and
# lagged estimated residuals up to order p (here: p=2)

lm3bBG <- lm(e ~ bond$dus3mt + e1 + e2); shccm(lm3bBG)

# 2. under H0: "no autocorrelation in error terms up to order p" the test statistic n*R^2
# follows a Chi-squared distribution with p degrees of freedom.

n <- length(e)
R2 <- summary(lm3bBG)$r.squared   # full component name, no partial matching
n*R2                   # test statistic
qchisq(p=0.95,df=2)    # theoretical quantile of the Chi-squared distribution under H0

# or simply:

library(lmtest)
bgtest(daaa ~ dus3mt, data=bond, order=2, type="Chisq")

# --- Durbin-Watson Test for Autocorrelated Errors ---
library(car)   # durbinWatsonTest() lives in car, which is never loaded above
?durbinWatsonTest
durbinWatsonTest(lm3a, max.lag=1, alternative="positive")
# or (lmtest's equivalent):
dwtest(daaa ~ dus3mt, data=bond, alternative="greater")

# --- Ex 4: Non-linearity in variables ----------------
# --- Ex 4: a) ---
# Fit a regression model of volume on t (a time trend)
nyse <- read.csv("http://klein.uk/R/nysevolume", header=TRUE)
str(nyse)

lm4a <- lm(volume ~ t, data=nyse); shccm(lm4a)

# --- Ex 4: b) ---
# Examine the residuals. Assess the linearity of the relation between volume and
# the time trend.

# Standard 2x2 diagnostic panel for the linear fit.
par(mfrow=c(2,2))
plot(lm4a)

# Raw data with the fitted straight line overlaid.
par(mfrow=c(1,1))
plot(volume ~ t, data=nyse)
abline(lm4a, col="red", lwd=2)

# --- Ex 4: c) ---
# Generate a log transformation of the variable volume, call it logvol. Run the
# regression in a. with this new variable. What happens now with the residual?

# Regress the log of volume on the time trend and re-inspect the diagnostics.
lm4c <- lm(log(volume) ~ t, data=nyse)
shccm(lm4c)
par(mfrow=c(2,2))
plot(lm4c)

# --- Ex 4: d) ---
# Now run the following model (and analyse again the residuals):
# log(volume) ~ t + t^2

# I() protects ^ from being interpreted as formula syntax.
lm4d <- lm(log(volume) ~ t + I(t^2), data=nyse)
shccm(lm4d)
plot(lm4d)

# --- Ex 5: Normality ----------------
# --- Ex 5: a) ---
# Regress changes in AAA bond returns (daaa) on US Treasury Bill interest rates (dus3mt).
# Obtain the histogram of the residuals.

bond <- read.csv("http://klein.uk/R/bonds", header=TRUE)
str(bond)

lm5a <- lm(daaa ~ dus3mt, data=bond); shccm(lm5a)
hist(residuals(lm5a), breaks=30)

library(timeDate)
# Moment-based (non-robust) sample skewness and kurtosis of the residuals.
sk <- skewness(residuals(lm5a), method="moment"); sk
ku <- kurtosis(residuals(lm5a), method="moment"); ku

# --- Ex 5: b) ---
# Analyse the Jarque-Bera test of normality results.

n <- length(residuals(lm5a))
# Jarque-Bera statistic with (n-2) degrees-of-freedom correction (handout, p.7);
# under normality kurtosis is 3, so (ku - 3) measures excess kurtosis.
(n-2)/6 *( sk^2 + ((ku - 3)^2)/4 )     # Jarque-Bera test statistic (handout, p.7)
qchisq(p=0.95, df=2)                   # theoretical distribution under the null

# or simply:

library(tseries)
jarque.bera.test(residuals(lm5a))

# --- Ex 6: Outliers ----------------
crime <- read.csv("http://klein.uk/R/crime", header=TRUE)
str(crime)

# --- Ex 6: a) ---
# A regression model for crime might have pctmetro, poverty, and single as independent
# variables. Run this regression and plot the residuals.

lm6a <- lm(crime ~ pctmetro + poverty + single, data=crime); shccm(lm6a)
par(mfrow=c(2,2))   # 2x2 panel for the four standard diagnostic plots
plot(lm6a)

# --- Ex 6: b) ---
# Identify what is the observation that is an outlier.

# Studentized residuals; sorting puts the most extreme observations at
# the head and tail of the data frame.
crime$rstudent <- rstudent(lm6a)
crime <- crime[order(crime$rstudent), ]
head(crime)
tail(crime)

# --- Ex 6: c) ---
# Try to solve this problem adding to the regression an impulse dummy variable.

# Impulse dummy for the District of Columbia (the outlier found in b).
crime$DC <- ifelse(crime$state=="dc", 1, 0)
lm6c <- lm(crime ~ pctmetro + poverty + single + DC, data=crime); shccm(lm6c)
# Equivalent approach: drop DC entirely. NOTE: this deliberately overwrites
# lm6c; the digression below relies on this 4-coefficient version.
lm6c <- lm(crime ~ pctmetro + poverty + single, data=subset(crime, state!="dc")); shccm(lm6c)

# --- Digression: Graphical Analysis of outlier ---
# Plot invisible points, then label each observation with its state code.
plot(crime ~ single, data=crime, col="white")
text(x=crime$single, y=crime$crime, labels=crime$state)
m.pctmetro <- mean(crime$pctmetro)
m.poverty <- mean(crime$poverty)
r.single <- seq(min(crime$single), max(crime$single), .1)
# Predicted crime at sample-mean pctmetro and poverty, as a function of single.
# Assumes the model has exactly the coefficients (intercept, pctmetro,
# poverty, single), in that order -- true for lm6a and the final lm6c.
myReg <- function(x, model) {
  coef(model) %*% c(1, m.pctmetro, m.poverty, x)
}
y <- sapply(r.single, myReg, model=lm6a)
lines(x=r.single, y=y, col="red", lwd=2)    # fit including DC
y <- sapply(r.single, myReg, model=lm6c)
lines(x=r.single, y=y, col="blue", lwd=2)   # fit excluding DC
legend("topleft", legend=c("with DC","without DC"), fill=c("red","blue"))

# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------

# Quit R; "no" is the 'save' argument, so the workspace is not saved.
q("no")