# -------------------------------------------------------------------
# Lecture 2: T-distribution, simple linear regression, introduction to multiple regression 

# Required libraries: car
  rm(list=ls())
  # source("http://klein.uk/R/myfunctions.R")
  ls()
# -------------------------------------------------------------------




# --- Critical values for t and Normal distribution as n -> Inf, (p.8) ---
 # 97.5% quantile of t-distribution with 10, 20, 30, 60, and 300 degrees of freedom
 sapply(c(10,20,30,60,300), function(x) qt(p=0.975, df=x))
 # 97.5% quantile of Normal distribution
 qnorm(p=0.975)




# --- California test scores, Stock and Watson (2007, p.152), (Slides p.21) ---
 install.packages("AER")
 help("StockWatson2007", package = "AER")

## data and transformations
 data("CASchools", package = "AER")
 str(CASchools)
 CASchools$stratio <- with(CASchools, students/teachers)
 CASchools$score <- with(CASchools, (math + read)/2)

## scatterplot
 library(car)
 plot(score ~ stratio, data=CASchools, xlim=c(0,max(CASchools$stratio)))

## linear model
 lm1 <- lm(score ~ stratio, data = CASchools)
 summary(lm1)
 abline(lm1, col="green")
 # what is the size of the effect in terms of quantiles?
 quantile(CASchools$stratio, probs=seq(0,1,.1))
 quantile(CASchools$score, probs=seq(0,1,.1))




# --- Standard Error of the Regression or: Residual standard error, (p.22) ---
 sqrt(1/(420-2) * sum(lm1$resid^2))
 str(lm1)




# --- Without the model, the best estimate of Y_i is the sample mean, (p.23) ---
  mean(CASchools$score)
  lm(score ~ 1, data=CASchools)




# --- Coefficient of determination or: R^2, (p.24) ---
 RSS <- sum(lm1$resid^2)
 ESS <- sum( (lm1$fitted - mean(lm1$fitted))^2 )
 TSS <- RSS + ESS
 ESS/TSS




# --- Assumption 1: E(u|X=x)=0, (p.29) ---
 scatterplot(lm1$resid ~ CASchools$stratio)




# --- Include a constant in the regression, (p.31) ---
 e = 2 + rnorm(1000)
 x = runif(1000,0,10)
 y = 0 + x + e

 plot(y ~ x, main="Include a constant in the regression")
 abline( lm(y ~ x), col="blue", lwd=2)
 abline( lm(y ~ -1 + x), col="red", lwd=2)
 legend("topleft",legend=c("y = a + bx","y = bx"),lwd=2,lty=1,col=c("blue","red"))




# --- histrograms and qq-plots, (pp.40) ---
 par(mfrow=c(1,2))
 hist(lm1$resid, 20, freq=FALSE)
 lines(dnorm(seq(-50,50,.1),sd=18.58) ~ seq(-50,50,.1), col="blue")
 qqnorm(lm1$resid)
 qqline(lm1$resid)

# compare with histograms and qq-plots for a t-distribution (fait tailed!)
 set.seed(12)
 x = rt(150,10)
 hist(x, 20, freq=FALSE, main="Histrogram of t-distribution with 10 d.f.")
 lines(dnorm(seq(-6,6,.1),sd=sd(x)) ~ seq(-6,6,.1), col="blue")
 qqnorm(x)
 qqline(x)




# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------