# -------------------------------------------------------------------
# Lecture 1: Normal distribution, estimators, sampling distributions
# of estimators, tests of hypotheses
# Required libraries: --
rm(list = ls())  # NOTE(review): clearing the workspace in a script is
                 # discouraged; kept because the lecture flow relies on it.
# source("http://klein.uk/R/myfunctions.R")
ls()
# -------------------------------------------------------------------

# --- Standard normal distribution ----------------------------------

# Normal distribution: 68-95-99.7 or six sigma rule
grid <- seq(-5, 5, 0.1)
plot(dnorm(grid) ~ grid, type = "l")
# Mark the mean and +/- 1, 2, 3 standard deviations.
# BUG FIX: the original drew only v = c(-3, -2, -1, 0), i.e. just the
# left-hand markers, contradicting the 68-95-99.7 comment above.
abline(h = 0, v = c(-3, -2, -1, 0, 1, 2, 3))

# What proportion of observations are smaller than 0.83? (p.6)
pnorm(0.83)

# What proportion of observations are greater than -2.15? (p.7)
# (Upper-tail probability; same value as 1 - pnorm(-2.15).)
pnorm(-2.15, lower.tail = FALSE)

# Inverse of SND: F^{-1}(0.03) = ? (p.8)
qnorm(0.03)

# Inventory example. (p.9ff)
# What is the probability of a stockout?
pnorm(20, mean = 15, sd = 6, lower.tail = FALSE)

# If the prob of stockout is to be no more than 5%, what should the
# reorder point be?
qnorm(1 - 0.05, mean = 15, sd = 6)

# --- Asymptotic properties of estimators (pages 33-35) -------------

# Simulate the sampling distribution of the sample mean: for sample
# size n, draw 10,000 samples from N(100, 50^2) and return a kernel
# density estimate of the 10,000 sample means.
d <- function(n) {
  density(sapply(1:10000, function(x) mean(rnorm(n, mean = 100, sd = 50))))
}

# Plot probability density of sample mean (sample size 1).
plot(d(1), ylim = c(0, 0.08), main = "Distribution of sample mean")
abline(h = 0)

# Sample size 4, 25, and 100: the densities concentrate around 100 as
# n grows (consistency of the sample mean).
sapply(c(4, 25, 100), function(x) lines(d(x)))

# --- Simulation: Sample variance, unbiased estimators (pages 37-39) ----

# --- PART A: Biased vs unbiased estimator for population variance.
# What's special about n-1 in the equation for the sample standard
# deviation? What would happen if we used n instead?
# Unbiased sample variance with denominator n-1
var(0:1)

# Biased sample variance with denominator n:
# newvar(x) = (1/n) * sum((x_i - xbar)^2), the maximum-likelihood
# (biased) variance estimator.
newvar <- function(x) 1/length(x) * sum((x - mean(x))^2)
newvar(0:1)

# 1,000 simulations (sample size 10) and mean of biased vs unbiased
# sample variance. True variance is 1, so the unbiased estimator
# should average near 1 and the biased one near 0.9 (= 9/10).
s2 <- sapply(1:1000, function(x) {
  sample10 <- rnorm(10, mean = 0, sd = 1)
  c(var(sample10), newvar(sample10))
})
s2.unbiased <- s2[1, ]; mean(s2.unbiased)
s2.biased <- s2[2, ]; mean(s2.biased)

# --- PART B: Consistency of the biased variance estimator.
# If the sample size is increased it ceases to matter whether we use n
# or n-1 in the denominator.

# Simulate distribution of biased sample variance for different sample
# sizes (this redefines d from the section above).
d <- function(n) {
  density(sapply(1:10000, function(x) newvar(rnorm(n, mean = 100, sd = 50))))
}

# Sample size 10 (vertical line marks the true variance 50^2 = 2500)
plot(d(10), ylim = c(0, 0.005)); abline(h = 0, v = 50^2)

# Sample sizes 20, 100, and 1000
sapply(c(20, 100, 1000), function(x) lines(d(x)))

# In summary, using n gives a biased estimate of the true variance.
# The smaller the sample size, the greater this discrepancy between
# the unbiased and biased estimator.

# --- Simulation: Central Limit Theorem (page 50) -------------------

# 10,000 draws from a uniform distribution. This is the parent
# distribution, which is obviously non-Normal.
x <- runif(10000)
hist(x)

# To compute an average, two observations are drawn at random from the
# parent distribution and averaged. Then another sample of two is
# drawn and another average is computed. This process is repeated
# 10,000 times.
x <- sapply(1:10000, function(x) mean(runif(2)))

# Distribution of averages of two.
# FIX: use FALSE, not the reassignable shorthand F.
hist(x, freq = FALSE)

# Repeatedly taking eight from the parent distribution and computing
# averages
x <- sapply(1:10000, function(x) mean(runif(8)))
hist(x, freq = FALSE)

# Distribution of the mean approaches a Normal distribution
lines(x = seq(0, 1, 0.01), y = dnorm(seq(0, 1, 0.01), mean(x), sd(x)),
      col = "blue")

# --- Simulation: Power of a test (pages 67-69) ---------------------

# Define hypothesis test function.
# h.test.A: for fixed sample size n, run one two-sided z-test of
# H0: mu = 0 (known sd = 1) for each population mean in the vector mu.
# Returns a logical vector, TRUE where H0 is rejected at the 5% level,
# i.e. where |xbar| exceeds the critical value qnorm(0.975)/sqrt(n).
h.test.A <- function(n, mu) {
  sapply(mu, function(x) {
    abs(mean(rnorm(n, mean = x, sd = 1))) > qnorm(0.975)/sqrt(n)
  })
}

# h.test.B: for a fixed population mean mu, run one two-sided z-test
# for each sample size in the vector n.
h.test.B <- function(n, mu) {
  sapply(n, function(x) {
    abs(mean(rnorm(x, mean = mu, sd = 1))) > qnorm(0.975)/sqrt(x)
  })
}

# --- PART A: Power of a test and evidence against H0 ---------------

# Set values of sample size and population mean.
n <- 10
mu <- c(0, 0.05, 0.1, 0.2, 1, 2)

# Run simulation for sample size n=10 and population means of 0, .05,
# .1, .2, 1, and 2.
# BUG FIX: the original called h.test(), which is never defined in
# this script; the Part A function is h.test.A().
data <- sapply(1:10000, function(x) h.test.A(n = n, mu = mu))

# Calculate percentage of rejections when null is not true
# (= power of test).
rejections <- sapply(1:6, function(x) sum(data[x, ])/10000)

# To see the test's power, graph the prob of rejecting H0 against the
# evidence.
plot(rejections ~ mu, ylab = "Prob of rejecting H0",
     xlab = "Evidence against H0", main = "Power of test")
lines(lowess(rejections ~ mu, f = 0.5))
# Horizontal line for the size of the test,
# i.e. Prob(rejecting H0 | H0 true).
abline(h = 0.05)

# --- PART B: Power of a test and sample size -----------------------

# Set values of sample size and population mean.
n <- c(10, 100, 1000)
mu <- 0.2

# Run simulation for population mean mu=0.2 and sample size of 10, 100
# and 1000.
data <- sapply(1:10000, function(x) h.test.B(n = n, mu = mu))

# Calculate percentage of rejections when null is not true
# (= power of test).
rejections <- sapply(1:3, function(x) sum(data[x, ])/10000)

# To see the test's power, graph the prob of rejecting H0 against the
# sample size.
plot(rejections ~ n, ylab = "Prob of rejecting H0",
     xlab = "Sample size", main = "Power of test")
lines(lowess(rejections ~ n, f = 0.5))

# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------