library(faraway)
###################################################
####### Identifiability############################
data(gala)
# number of species of tortoise on various Galapagos Islands
gala
mdl <- lm(Species ~ Area + Elevation + Nearest + Scruz + Adjacent, data=gala)
summary(mdl)
x <- model.matrix( ~ Area + Elevation + Nearest + Scruz + Adjacent,gala)
y <- gala$Species #response

gala$Adiff <- gala$Area -gala$Adjacent # new variable
g <- lm(Species ~ Area+Elevation+Nearest+Scruz+Adjacent+Adiff,gala) 
# rank of design matrix is 6, columns is 7, u
summary(g)
Adiffe <- gala$Adiff+0.001*(runif(30)-0.5) # new variable 2, add random perturbations
g <- lm(Species ~ Area+Elevation+Nearest+Scruz+Adjacent+Adiffe,gala)
summary(g)
# standartni chyby odhadu VELMI velke

###################################################
############Testing examples######################
data(savings)
# sr - aggregate personal savings divided by disposable income
# dpi - pervapital disponsable income in U.S.dollars
# ddpi - percentage rate of change in percapita disponsable income
# pop15 - percentage of population under 15
# pop 75 - percentage of population over 75
savings
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, savings)
summary(g) # F-test
#testing just one-parameter
g2 <- lm(sr ~ pop75 + dpi + ddpi, savings) # T-test for pop15 rejected
(rss2 <- deviance(g2))
(fstat <- (deviance(g2)-deviance(g))/(deviance(g)/df.residual(g)))
1-pf(fstat,1,df.residual(g)) # F-test
anova(g2,g) #compare 2 nested models
########Testing pair of predictors###########################
# pop75+ddpi
g3 <- lm(sr ~ pop15+dpi , savings)
anova(g3,g)
################Testing a subspace################
# H0: beta_pop15=beta_pop75
g <- lm(sr ~ .,savings)
gr <- lm(sr ~ I(pop15+pop75)+dpi+ddpi,savings)
anova(gr,g) # H0 can not be rejected
# H0: beta_ddpi=05
# offset=fixed coefficient in the regression quation
gr <- lm(sr ~ pop15+pop75+dpi+offset(0.5*ddpi),savings)
anova(gr,g) # H0 is not rejected
(tstat <- (0.409695-0.5)/0.196197) # the same using t-statistics
2*pt(tstat,45)
tstat^2

######################################################
#####################Designed experiments#############
# we can controll X 
# ORTHOGONALITY 
# RANDOMIZATION

# Orthogovality - allows easily interpret the effect of one 
# predictor without regard to another
data(odor)
# effect of column temperature, gas/liquid ratio and packing height in
# reducing unpleasant odor of a chemical product for househol use
odor
x <- as.matrix(cbind(1,odor[,-1])) # X matrix, central composite design
t(x) %*% x    # X^T * X matrix is diagonal
g <- lm(odor ~ temp + gas + pack, odor) # Fit model
summary(g,cor=T) # write correlation
g <- lm(odor ~ gas + pack, odor) # drop one variable
summary(g) # coefficient themselves do not change, small changes in SE
# saving data - observational data
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, savings)
summary(g) 
g <- update(g, . ~ . - pop15)
summary(g) # pop75 had negative, now positive effect
#################Lurking variable##################
shoe size - reading ability at elementary school
age - lurking variable
##########################Observational data############## 
# 4 models, all include pop75, the signifficance and sign of effect is influenced
# by other variables in a model
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, savings)
summary(g)
g2 <- lm(sr ~ pop75 + dpi + ddpi, savings)
summary(g2)
g3 <- lm(sr ~ pop75 + ddpi, savings)
summary(g3)
g4 <- lm(sr ~ pop75, savings)
summary(g4)

##############################################################
########################Diagnostics#########################
# Error
# Model
# Unusual observations
#########
# Constatnt variance
data(savings)
g <- lm(sr ~ pop15+pop75+dpi+ddpi,savings)
plot(fitted(g),residuals(g),xlab="Fitted",ylab="Residuals")
abline(h=0)
plot(fitted(g),abs(residuals(g)),xlab="Fitted",ylab="|Residuals|")
summary(lm(abs(residuals(g)) ~ fitted(g)))
par(mfrow=c(3,3))
for(i in 1:9) plot(1:50,rnorm(50)) # Constant variance
for(i in 1:9) plot(1:50,(1:50)*rnorm(50)) # Strong non-constant variance
for(i in 1:9) plot(1:50,sqrt((1:50))*rnorm(50)) # Mild non-constant variance
for(i in 1:9) plot(1:50,cos((1:50)*pi/25)+rnorm(50))#non-linearity
par(mfrow=c(1,1))
# F test to compare variance
plot(savings$pop15,residuals(g),xlab="Population under 15",ylab="Residuals")
plot(savings$pop75,residuals(g),xlab="Population over 75",ylab="Residuals")
var.test(residuals(g)[savings$pop15>35],residuals(g)[savings$pop15<35])
# Tranformation
data(gala)
gg <- lm(Species ~ Area + Elevation + Scruz + Nearest + Adjacent, gala)
plot(fitted(gg),residuals(gg),xlab="Fitted",ylab="Residuals")
gs <- lm(sqrt(Species) ~ Area+ Elevation+ Scruz+ Nearest+ Adjacent, gala)
plot(fitted(gs),residuals(gs),xlab="Fitted",ylab="Residuals",main="Square root Response")
###################
# Normality
qqnorm(residuals(g),ylab="Residuals")
qqline(residuals(g))
hist(residuals(g))
par(mfrow=c(3,3))
for(i in 1:9) qqnorm(rnorm(50)) # normal
for(i in 1:9) qqnorm(exp(rnorm(50))) # log-normal - skewed distribution
for(i in 1:9) qqnorm(rcauchy(50)) #Cauchy - log-tailed distribution
for(i in 1:9) qqnorm(runif(50)) # Uniform - short tailed distribution
par(mfrow=c(1,1))
shapiro.test(residuals(g))
######################
#Correlated errors
data(airquality)
airquality
pairs(airquality,panel=panel.smooth)
g <- lm(Ozone ~ Solar.R + Wind + Temp,airquality,na.action = na.exclude)
summary(g)
plot(fitted(g),residuals(g),xlab="Fitted",ylab="Residuals")
# g1 with omitted incomplete cases, transformation to stabilize non-linearity and heterosked.
gl <- lm(log(Ozone) ~ Solar.R + Wind + Temp,airquality,na.action=na.exclude)
plot(fitted(gl),residuals(gl),xlab="Fitted",ylab="Residuals")
# index plot of residuals - if there is serial correlation 
#  positive -> long runs abowe or bellow line
#  negative -> greater than normal fluctuation
plot(residuals(gl),ylab="Residuals")
abline(h=0)
# no signofficant correlation
plot(residuals(gl)[-153],residuals(gl)[-1], xlab=expression(hat(epsilon)[i]),ylab=expression(hat(epsilon)[i+1]))
summary(lm(residuals(gl)[-1] ~ -1+residuals(gl)[-153]))
library(lmtest) # instal library
#durbin-watson test, p=0.3347
dwtest(Ozone ~ Solar.R + Wind + Temp,data=na.omit(airquality))
####################################
######Unusual observations
#leverage
g <- lm(sr ~ pop15 + pop75 + dpi + ddpi, savings)
ginf <- influence(g)
ginf$hat
sum(ginf$hat)
countries <- row.names(savings)
halfnorm(lm.influence(g)$hat,labs=countries,ylab="Leverages") #outliers diverge from others
gs <- summary(g)
gs$sig
stud <- residuals(g)/(gs$sig*sqrt(1-ginf$hat)) # studentized residuals
qqnorm(stud)
abline(0,1)
#Outliers
jack <- rstudent(g) # jacknife residuals
jack[which.max(abs(jack))]
qt(.05/(50*2),44)
data(star)
plot(star$temp,star$light,xlab="log(Temperature)",ylab="log(Light Intensity)")
ga <- lm(light ~ temp, star)
abline(ga)
range(rstudent(ga))
ga <- lm(light ~ temp, data=star, subset=(temp>3.6))
abline(ga,lty=2)
# Influential observations
# An influential point is a one, whose removal from the dataset could cause a large change in fit
cook <- cooks.distance(g) # Cook statistics - combines residual + leverage effect
halfnorm(cook,3,labs=countries,ylab="Cook's distances")
gl <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(cook < max(cook)))
summary(gl)
summary(g) # coefficient for ddpi changed about 50%
# leave out differences in coefficients
plot(ginf$coef[,2],ylab="Change in pop15 coef")
identify(1:50,ginf$coef[,2],countries) # press ESCAPE
gj <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(countries != "Japan"))
summary(gj) # Japan removed, ddpi - term no longer signifficant
# Checking the stucture of model - SKIP
#d <- residuals(lm(sr ~ pop75 + dpi + ddpi,savings))
#m <- residuals(lm(pop15 ~ pop75 + dpi + ddpi,savings))
#plot(m,d,xlab="pop15 residuals",ylab="Savings residuals")
#coef(lm(d ~ m))
#coef(g)
#abline(0,coef(g)['pop15'])
#plot(savings$pop15,residuals(g)+coef(g)['pop15']*savings$pop15,xlab="pop'n under 15", ylab="Savings(Adjusted)")
#abline(0,coef(g)['pop15'])
#prplot(g,1)
#g1 <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(pop15 > 35))
#g2 <- lm(sr ~ pop15+pop75+dpi+ddpi,savings,subset=(pop15 < 35))
#summary(g1)
#summary(g2) 
##########################################################
#################Problems with predictors###############

# Change of scale
data(savings)
g <- lm(sr ~ pop15+pop75+dpi+ddpi,savings)
summary(g)
g <- lm(sr ~ pop15+pop75+I(dpi/1000)+ddpi,savings) # in thousands of dollars
summary(g)
#scaling - convert variables to standart units (mean=0, variance=1)
scsav <- data.frame(scale(savings))
g <- lm(sr ~ ., scsav) # coefficients -1...1, kind of partial correlation
summary(g)
#colinearity
data(seatpos)
g <- lm(hipcenter ~ ., seatpos)
summary(g)
round(cor(seatpos),3) # pair-wise correlations
x <- model.matrix(g)[,-1]
e <- eigen(t(x) %*% x) # eigendecomposition
e$val
sqrt(e$val[1]/e$val) # problem with more then one linear combinations
summary(lm(x[,1] ~ x[,-1]))$r.squared
1/(1-0.49948) # VIF for the first variable
vif(x) # standart error sqrt(307.4)=17.5 is times larger, that would be without colinearity
# nonstability!
g <- lm(hipcenter+10*rnorm(38) ~ ., seatpos) # + random perturbation
summary(g)
round(cor(x[,3:8]),2)
g2 <- lm(hipcenter ~ Age + Weight + Ht, seatpos) # amputation
summary(g2) 
# OR ridge regression, when we must keep all variables















































