# -------------------------------------------------------------------
# Master in Finance Econometrics Module
# Thilo Klein
# Lab Session 1: Descriptive Statistics and Linear Regression in R

# Required libraries: Rcmdr, timeDate
  source("http://klein.uk/R/myfunctions.R")
  ls()
# -------------------------------------------------------------------




# --- Ex I-1: find help; R Commander; install/load packages -------------------
 ?memory.size               # help if exact command is known
 help.search("memory size")  # help if exact command is not knowm
 install.packages("Rcmdr")   # install package Rcmdr
 library(Rcmdr)             # load package




# --- Ex I-2: Load a dataset and save it with a different name ----------------
 eaef2 <- read.csv("http://klein.uk/R/eaef", header=T)  # read dataset from .csv file
 ls()                                                   # display active objects in workspace
 save("eaef2", file="eaef2.RData")              # save active object eaef2
 rm(eaef2)                                              # clear object eaef2 from workspace
 rm(list=ls())                                  # clear workspace
 load("eaef2.RData")                            # load object eaef2




# --- Ex 1: Calculate summary statistics ----------------
 str(eaef2)
 summary(eaef2)
 attach(eaef2)

 hist(weight, col="grey", main="Histogram of Weight", xlab="weight in pounds")




# --- Ex 2: Calculate proportions of observations presenting a certain characteristic ----------------
# --- Ex 2: a) ---
# What is the proportion of observations with 3 siblings? 

 length(siblings[siblings==3])
 length(siblings)
 118/540

 # or:
 t <- table(siblings)
 round( t/sum(t), 2)*100


# --- Ex 2: b) ---
# What is the proportion of observations with weight less than 120?

 length(weight[weight<120])
 78/540

 # or:
 dim(subset(eaef2, weight<120))[1]/dim(eaef2)[1]




# --- Ex 3: Analysis of the frequency of discrete variables ----------------
# --- Ex 3: a) ---
# Tabulate the count, percentage, cumulative count and cumulative percentage for 
# every given number of siblings (from 0 to 13).

 table(siblings)                                # count
 t <- table(siblings)                   # percentage
 round( t/sum(t), 4 )*100
 cumsum(t)                                      # cumulative count
 round( cumsum(t/sum(t)), 4)*100        # cumulative percentage


# --- Ex 3: b) ---
# What is the proportion of observations with 3 siblings?

 t[4]/sum(t)




# --- Ex 4: Frequency of a combination of discrete variables (two-way tables) ----------------

 eaef2$agecut <- cut(age, breaks=quantile(age))
 attach(eaef2)
 table(agecut, siblings)




# --- Ex 5: Analysis of a variable conditioned on a discrete variable ----------------
# --- Ex 5: a) ---
# Evaluate the mean earnings conditioned on the four age-groups defined above.

 by(data=earnings, INDICES=agecut, FUN=mean)


# --- Ex 5: b) ---
# Plot histograms for earnings conditioned on the four age-groups.

 par(mfrow=c(2,2))
 by(data = earnings, INDICES = agecut, FUN = hist)




# --- Ex 6: Graphs ----------------

 plot(height ~ weight)
 hist(earnings)




# --- Ex 7: Generate linear transformation ----------------
 eaef2$ptearnings <- 2 + (earnings-2)*0.8
 eaef2$earnings <- log(earnings)




# --- Ex 8: T-tests ----------------
# --- Ex 8: b) ---
# Using R, perform this test and calculate the confidence interval at 99% 
# significance level.

 data <- scan("clipboard")
 mean(data) + c(-1,1)*sd(data)*qnorm(0.99)
 t.test(data, mu=60, alternative="less", conf.level=0.99)




# --- Ex 9: The linear model ----------------
# --- Ex 9: a) ---
# Review the contents and regress employment growth on GDP growth. Provide an 
# interpretation of the results.

 growth <- read.csv("http://klein.uk/R/growth",header=T,sep=",")
 str(growth)
 lm9 <- lm(empgrow ~ GDPgrow, data=growth); summary(lm9)


# --- Ex 9: b) ---
# Visually inspect data and regression line. 

 plot(empgrow ~ GDPgrow, data=growth)
 abline(lm, col="red")


# --- Ex 9: c) ---
# Are the coefficients significant? 

 # compare t-value of b2 (5.8) to 97.5% quantile of t-dist (2.07) and reject H0: b2=0.
 qt(p=1-0.025, df=23)
 qt(p=0.025, df=23, lower.tail=F)


# --- Ex 9: f) ---
# Build a confidence interval for the slope. 

 b_1 <- lm9$coef[1]
 sd_b1 <- summary(lm9)$coef[1,2]
 t_0.975 <- qt(p=1-0.025, df=23)
 b_1 + c(-1,1)*sd_b1*t_0.975            # 95%-confidence interval for b_1




# --- Ex 10: The linear model with quadratic terms ----------------
 house <- read.csv("http://klein.uk/R/housing",header=T,sep=",")
 str(house)


# --- Ex 10: a) ---
# Estimate a model to test this, using total expenditure as a proxy for total income.

 lm10a <- lm(housing ~ total, data=house)
 summary(lm10a)


# --- Ex 10: b) ---
# Is a quadratic form more appropriate?

 house$totalsq <- house$total^2

 lm10b <- lm(housing ~ total + totalsq, data=house)
 summary(lm10b)




# --- Ex 11: Extrapolation and accuracy of least squares ----------------
 eaef <- read.csv("http://klein.uk/R/eaef",header=T,sep=",")
 str(eaef)

 lm11 <- lm(weight ~ height, data=eaef)
 summary(lm11)




# --- Ex 12: Estimates for changing units of measurement ----------------
 eaef$weight_grams <- eaef$weight*454           # one pound is 454 grams
 eaef$height_metric <- eaef$height * 2.54               # one inch is 2.54 cm

 lm(weight_grams ~ height, data=eaef)
 5.562*454   # =2525.148

 lm(weight ~ height_metric, data=eaef)
 5.562496/2.54   # =2.189959




# --- Ex 13: Multiple linear regression ----------------
 hprice1 <- read.csv("http://klein.uk/R/hprice1",header=T,sep=",")
 str(hprice1)

 lm13 <- lm(price ~ sqrft + bdrms, data=hprice1)
 summary(lm13)




# --- Ex 14: Reversal of regressor and regressand ----------------
 lm(earnings ~ schooling, data=eaef)
 lm(schooling ~ earnings, data=eaef)

 ?cor
 cor(eaef$earnings, eaef$schooling, method="pearson")




# --- Ex 15: Regression against a constant (optional) ----------------
 summary(eaef$weight)
 lm(weight ~ 1, data=eaef)
 # lm(weight ~ -1 + c(rep(2,540)), data=eaef)




# --- Ex 16: Confidence intervals for regression coefficients ----------------
 t5 <- qt(p=0.025, df=60-2, lower.tail=F)       # t-quantile 5%
 t1 <- qt(p=0.005, df=60-2, lower.tail=F) # t-quantile 1%

 myCI <- function(b, se, t){    # function to generate CI
   b + c(-1,1)*se*t
 }

 myCI(-.2, .07, t5)     # CI for beta=-0.2, se_beta=0.07; at 5% level
 myCI(-.12, .07, t5)    # CI for beta=-0.12, se_beta=0.07; at 1% level




# --- Digression: my function to count NAs per variable ---
 showNAs
 showNAs(dataset2)





# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------

 q("no")