# -------------------------------------------------------------------
# MPO1 Quantitative Research Methods
# Thilo Klein
# Lab Session 1: General Introduction to R; Descriptive Statistics

# Required libraries: Rcmdr, timeDate
  source("http://klein.uk/R/myfunctions.R")
  ls()
# -------------------------------------------------------------------




# --- Ex 1: find help; R Commander; install/load packages -------------------
 ?memory.size                    # help if exact command is known
 help.search("memory size")  # help if exact command is not knowm
 install.packages("Rcmdr")   # install package Rcmdr
 library(Rcmdr)                  # load package




# --- Ex 2: Load a dataset and save it with a different name ----------------
 dataset2 <- read.csv("http://klein.uk/R/dataset", header=T)  # read dataset from .csv file
 ls()                                                           # display active objects in workspace
 save("dataset2", file="dataset2.RData")                # save active object dataset2
 rm(dataset2)                                           # clear object dataset2 from workspace
 rm(list=ls())                                          # clear workspace
 load("C:/.../dataset2.RData")                          # load object dataset2




# --- Ex 4: Provide description(s) for one or more variables ------------------
 str(dataset2)                      # some characteristics of all variables 
 is.double(dataset2$rev)        # is variable rev of type double?




# --- Ex 5: Inspect and modify the dataset ------------------
 attach(dataset2)                           # directly call the variables, to undo: detach()
 edit(dataset2)                                 # open AND modify dataset
 edit(year)                                             # open AND modify single variable
 browse(dataset2)                           # open dataset
 cbind(comp_name, year, level, capital)
 cbind(comp_name, year, level, capital)[level > 3 & is.na(level)==FALSE]




# --- Ex 6: Create (generate) a new variable ------------------
# --- Ex 6: a) ---
# Calculate output per employee for all enterprises.

 totemp <- softemp + othemp
 productivity <- rev / totemp


# --- Ex 6: b) ---
# generate a variable containing revenue squared.

 revsq <- rev^2
 lrev <- log(rev)
 ltotemp <- log(totemp)




# --- Ex 7: Generate an indicator variable ------------------
 year2002 <- ifelse(year == 2002, 1, 0)
 cbind(year, year2002)
 compnameis <- paste("company name is:", comp_name, sep=" ")
 compnameis[1:10]




# --- Ex 8: Calculate summary statistics. Identify missing values and outliers. Note that the latter may be the result of mistakes. Calculate proportions of individuals presenting a certain characteristic. ------------------
 summary(dataset2)
 NA%in%level
 length(level[is.na(level)])




# --- Ex 9: Calculate additional summary statistics but only for a few variables, or a subset of observations. ------------------
 sd(rev)
 var(rev)
 help.search("Skewness"); library(timeDate); skewness(rev)
 sd(rev[level>2])




# --- Ex 10: Analysis of the frequency of discrete variables ------------------
 table(pub)




# --- Ex 11: Analysis of the frequency of a combination of discrete variables (two-way tables) ------------------
 table(pub, level)
 t<-table(pub, level)
 t/sum(t)




# --- Ex 12: Analysis of a variable conditioned on a discrete variable ------------------
 ?by
 by(data=totemp, INDICES=us, FUN=summary)
 by(totemp, us, summary)
 by(totemp, india, summary); by(rev, india, summary)




# --- Ex 13: Graphs ------------------
# --- Ex 13: a) ---
# Scatter plots

 plot(rev ~ totemp) # plot(totemp, rev)
 plot(lrev ~ ltotemp)


# --- Ex 13: b) ---
# Histograms

 hist(rev)
 hist(rev[rev<1000])
 hist(rev[rev<500], breaks=50)
 par(mfrow=c(2,1))
 hist(rev[rev<200 & uk==1], breaks=50); hist(rev[rev<200 & uk==0], breaks=50)




# --- Ex 14: Drop variables or observations ------------------
 dataset2$dom <- NULL




# --- Ex 15: Generate linear transformations ------------------
 othemp_lt <- 1 + othemp/100
 sum(othemp)
 mean(othemp)
 1 + mean(othemp)/100
 othemp_qt <- 1 + othemp/100 + 5 * othemp^2
 plot(othemp_lt ~ othemp)
 plot(othemp_qt ~ othemp)




# --- Ex 16: Some properties of the sample variance ------------------
 cor(othemp, softemp, use=" pairwise.complete.obs")
 cov(othemp, softemp, use=" pairwise.complete.obs")


# --- Ex 16: a) ---
# generate a variable which is the sum and  apply the variance. You have created the 
# variable (totemp), so you need to apply the variance.

 myvar <- function(x){var(x, na.rm=T)}
 myvar(totemp)


# --- Ex 16: b) ---
# Apply the formula: Var (X+Y) = Var (X) + Var (Y) + 2 * covariance (X, Y)

 myvar(othemp) + myvar(softemp) + 2*cov(othemp, softemp, use="pairwise.complete.obs")

 # the same applies to the mean:
 mymean <- function(x){mean(x,na.rm=T)}
 mymean(othemp) + mymean(softemp)




# --- Ex 19: T-tests (in your own time) ------------------
# --- Ex 19: b) ---
# Using R, perform this test and calculate the confidence interval at 99% 
# significance level.

 data <- scan("clipboard")
 mean(data) + c(-1,1)*sd(data)*qnorm(0.99)
 t.test(data, mu=60, alternative="less", conf.level=0.99)




# --- Digression: my function to count NAs per variable ---
 source("http://klein.uk/R/myfunctions.R")
 showNAs(dataset2)





# -------------------------------------------------------------------
# --- End of Session ------------------------------------------------

 q("no")