############# ## Cviceni 1 ############# fev <- read.table("fev.txt", header=TRUE) dim(fev) fev[1:5, ] sum(duplicated(fev$ID)) fev <- fev[, -1] fev$Height <- 2.54*fev$Height summary(fev) ############ ## Cviceni 2 ############ # Descriptive statistics # ---------------------- plot(fev$FEV~fev$Age) lines(lowess(fev$FEV~fev$Age)) plot(fev$FEV~fev$Height) lines(lowess(fev$FEV~fev$Height)) plot(fev$Height~fev$Age) boxplot(fev$FEV~fev$Sex) boxplot(fev$FEV~fev$Smoker) barplot(height=prop.table(table(fev$Smoker, fev$Sex), margin=2), beside=F, ylab="Percentage", xlab="Gender", main="Smoking by gender") # ... # Basic model # ----------- model.basic <- lm(FEV~Height + Age + Sex + Smoker, data=fev) summary(model.basic) plot(model.basic, which=c(1:6)) # Quadratic dependence on height # ------------------------------ model.quad <- lm(FEV~Height + I(Height^2) + Age + Sex + Smoker, data=fev) summary(model.quad) plot(model.quad, which=c(1:6)) # Quadratic dependence on height, treating multicollinearity # ---------------------------------------------------------- model.quad.poly <- lm(FEV ~ poly(Height, 2) + Age + Sex + Smoker, data=fev) # the same model but the quadratic dependence on Height is fitted using orthogonal polynomials summary(model.quad.poly) plot(model.quad.poly, which=c(1:6)) # Quadratic dependence on height, possibly modified by gender # ----------------------------------------------------------- model.quad.inter <- lm(FEV~(Height + I(Height^2))*Sex + Age + Smoker, data=fev) summary(model.quad.inter) plot(model.quad.inter, which=c(1:6)) model.quad.inter.poly <- lm(FEV~poly(Height, degree=2)*Sex + Age + Smoker, data=fev) summary(model.quad.inter.poly) plot(model.quad.inter.poly, which=c(1:6)) # Quadratic dependence on height, possibly modified by gender and smoking # ----------------------------------------------------------------------- boxplot(resid(model.quad.inter.poly)~fev$Smoker) model.quad.inter.poly.smoker <- lm(FEV~(poly(Height, degree=2) + Sex + Smoker)^2 + Age, data=fev) summary(model.quad.inter.poly.smoker) plot(model.quad.inter.poly.smoker, which=c(1:6)) anova(model.quad.inter.poly, model.quad.inter.poly.smoker) # Quadratic dependence on height for boys, linear for girls # --------------------------------------------------------- model.quadboys <- lm(FEV~ Height*Sex + Sex + I(Height^2*as.numeric(Sex=="Male")) + Smoker + Age, data=fev) summary(model.quadboys) plot(model.quadboys, which=c(1:6)) anova(model.quadboys, model.quad.inter.poly) ############################## ## Model comparison by F tests ############################## anova(model.quad.poly, model.quad.inter.poly) anova(model.quad, model.quad.inter) anova(model.basic, model.quad) anova(model.basic, model.quad.inter) #################### ## Multicollinearity #################### library(car) vif(model.basic) vif(model.quad) vif(model.quad.poly) vif(model.quad.inter) vif(model.quad.inter.poly) vif(model.quadboys) ################## ## Cviceni 2/3 (a) ################## # Plot everything depending on height plot.all.by.height <- function(){ plot(fev$FEV~fev$Height, main="FEV by height", ylab="FEV [l]", xlab="Height [cm]", pch=19, col="deepskyblue") points(fev$FEV[fev$Smoker=="Non" & fev$Sex=="Female"]~fev$Height[fev$Smoker=="Non" & fev$Sex=="Female"], col="deeppink", pch=19) points(fev$FEV[fev$Smoker=="Current" & fev$Sex=="Male"]~fev$Height[fev$Smoker=="Current" & fev$Sex=="Male"], col="navyblue", pch=19) points(fev$FEV[fev$Smoker=="Current" & fev$Sex=="Female"]~fev$Height[fev$Smoker=="Current" & fev$Sex=="Female"], col="firebrick", pch=19) legend(x="topleft", col=c("deeppink", "firebrick", "deepskyblue", "navyblue"), legend=c("Non-smoking girls", "Smoking girls", "Non-smoking boys", "Smoking boys"), pch=19) } par(mfrow=c(1, 2)) plot.all.by.height() lines(predict(model.quad.inter.poly, newdata=data.frame(Height=sort(fev$Height), Sex="Female", Age=median(fev$Age), Smoker="Current"))~sort(fev$Height), lwd=3, col="firebrick", lty=2) lines(predict(model.quad.inter.poly, newdata=data.frame(Height=sort(fev$Height), Sex="Female", Age=median(fev$Age), Smoker="Non"))~sort(fev$Height), lwd=3, col="deeppink", lty=1) lines(predict(model.quad.inter.poly, newdata=data.frame(Height=sort(fev$Height), Sex="Male", Age=median(fev$Age), Smoker="Non"))~sort(fev$Height), lwd=3, col="deepskyblue", lty=1) lines(predict(model.quad.inter.poly, newdata=data.frame(Height=sort(fev$Height), Sex="Male", Age=median(fev$Age), Smoker="Current"))~sort(fev$Height), lwd=3, col="navyblue", lty=2) plot.all.by.height() lines(predict(model.quadboys, newdata=data.frame(Height=sort(fev$Height), Sex="Female", Age=median(fev$Age), Smoker="Current"))~sort(fev$Height), lwd=3, col="firebrick", lty=2) lines(predict(model.quadboys, newdata=data.frame(Height=sort(fev$Height), Sex="Female", Age=median(fev$Age), Smoker="Non"))~sort(fev$Height), lwd=3, col="deeppink", lty=1) lines(predict(model.quadboys, newdata=data.frame(Height=sort(fev$Height), Sex="Male", Age=median(fev$Age), Smoker="Non"))~sort(fev$Height), lwd=3, col="deepskyblue", lty=1) lines(predict(model.quadboys, newdata=data.frame(Height=sort(fev$Height), Sex="Male", Age=median(fev$Age), Smoker="Current"))~sort(fev$Height), lwd=3, col="navyblue", lty=2) #################### ## Cviceni 3 (b)+(c) #################### summary(model.quadboys) coef.quadboys <- coefficients(model.quadboys) confint(model.quadboys) parabola <- function(a, b, c, x){ return(a*x^2+b*x+c) } xx <- seq(min(fev$Height), max(fev$Height), length=501) yyS <- parabola(a=coef.quadboys[4], b=coef.quadboys[2]+coef.quadboys[7], c=coef.quadboys[1]+coef.quadboys[3]+coef.quadboys[6]*median(fev$Age), x=xx) yyN <- parabola(a=coef.quadboys[4], b=coef.quadboys[2]+coef.quadboys[7], c=coef.quadboys[1]+coef.quadboys[3]+coef.quadboys[6]*median(fev$Age)+coef.quadboys[5], x=xx) par(mfrow=c(1,1)) plot(yyS~xx, type="l", lwd=2, col="navyblue", xlab="Height [cm]", ylab="FEV [l]", main="Fitted dependence of FEV on Height for boys") lines(yyN~xx, type="l", lwd=2, col="deepskyblue") cov.quadboys <- vcov(model.quadboys) ci.prediction <- function(x){ est <- t(x)%*%coef.quadboys sd <- t(x)%*%cov.quadboys%*%x return(est+c(-1, 1)*qt(0.975, df=647)*sqrt(sd)) } xxx.N <- cbind(Intercept=1, Height=xx, Male=1, Height.2=xx^2, NonSmoke=1, Age=median(fev$Age), HeightMale=xx) xxx.S <- cbind(Intercept=1, Height=xx, Male=1, Height2=xx^2, NonSmoke=0, Age=median(fev$Age), HeightMale=xx) yy.ci <- data.frame(low.N=xx, up.N=NA, low.S=NA, up.S=NA) for(i in 1:length(xx)){ yy.ci[i, 1:2] <- ci.prediction(x=xxx.N[i, ]) yy.ci[i, 3:4] <- ci.prediction(x=xxx.S[i, ]) } lines(yy.ci$up.N~xx, type="l", lwd=2, lty=2, col="deepskyblue") lines(yy.ci$low.N~xx, type="l", lwd=2, lty=2, col="deepskyblue") lines(yy.ci$up.S~xx, type="l", lwd=2, lty=2, col="navyblue") lines(yy.ci$low.S~xx, type="l", lwd=2, lty=2, col="navyblue") ################ ## Cviceni 3 (d) ################ # podobne ako ci.prediction