```# -------------------------------------------------------------------
# Lecture 2: T-distribution, simple linear regression, introduction to multiple regression

# Required libraries: car
rm(list=ls())
# source("http://klein.uk/R/myfunctions.R")
ls()
# -------------------------------------------------------------------

# --- Critical values for t and Normal distribution as n -> Inf, (p.8) ---
# 97.5% quantile of t-distribution with 10, 20, 30, 60, and 300 degrees of freedom
sapply(c(10,20,30,60,300), function(x) qt(p=0.975, df=x))
# 97.5% quantile of Normal distribution
qnorm(p=0.975)

# --- California test scores, Stock and Watson (2007, p.152), (Slides p.21) ---
install.packages("AER")
help("StockWatson2007", package = "AER")

## data and transformations
data("CASchools", package = "AER")
str(CASchools)
CASchools\$stratio <- with(CASchools, students/teachers)
CASchools\$score <- with(CASchools, (math + read)/2)

## scatterplot
library(car)
plot(score ~ stratio, data=CASchools, xlim=c(0,max(CASchools\$stratio)))

## linear model
lm1 <- lm(score ~ stratio, data = CASchools)
summary(lm1)
abline(lm1, col="green")
# what is the size of the effect in terms of quantiles?
quantile(CASchools\$stratio, probs=seq(0,1,.1))
quantile(CASchools\$score, probs=seq(0,1,.1))

# --- Standard Error of the Regression or: Residual standard error, (p.22) ---
sqrt(1/(420-2) * sum(lm1\$resid^2))
str(lm1)

# --- Without the model, the best estimate of Y_i is the sample mean, (p.23) ---
mean(CASchools\$score)
lm(score ~ 1, data=CASchools)

# --- Coefficient of determination or: R^2, (p.24) ---
ESS <- sum( (lm1\$fitted - mean(lm1\$fitted))^2 )
ESS/TSS

# --- Assumption 1: E(u|X=x)=0, (p.29) ---
scatterplot(lm1\$resid ~ CASchools\$stratio)

# --- Include a constant in the regression, (p.31) ---
e = 2 + rnorm(1000)
x = runif(1000,0,10)
y = 0 + x + e

plot(y ~ x, main="Include a constant in the regression")
abline( lm(y ~ x), col="blue", lwd=2)
abline( lm(y ~ -1 + x), col="red", lwd=2)
legend("topleft",legend=c("y = a + bx","y = bx"),lwd=2,lty=1,col=c("blue","red"))

# --- histrograms and qq-plots, (pp.40) ---
par(mfrow=c(1,2))
hist(lm1\$resid, 20, freq=FALSE)
lines(dnorm(seq(-50,50,.1),sd=18.58) ~ seq(-50,50,.1), col="blue")
qqnorm(lm1\$resid)
qqline(lm1\$resid)

# compare with histograms and qq-plots for a t-distribution (fait tailed!)
set.seed(12)
x = rt(150,10)
hist(x, 20, freq=FALSE, main="Histrogram of t-distribution with 10 d.f.")
lines(dnorm(seq(-6,6,.1),sd=sd(x)) ~ seq(-6,6,.1), col="blue")
qqnorm(x)
qqline(x)

# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------
```