example.R

david — Jul 14, 2014, 11:57 PM

## example of use of the methods developed in the paper
## with simulated incomplete functional data

# load the method implementations (expected in the working directory);
# simul.missfd.R provides simul.fd() and simul.obs() used below,
# pred.missfd.R the mean/var/eigen and pred.* functions
source("pred.missfd.R")
source("simul.missfd.R")

## generate random functional data and missing periods

k <- 200                      # number of grid points
gr <- (seq_len(k) - 0.5) / k  # equidistant grid of k midpoints in [0, 1]
n <- 200                      # number of curves
set.seed(1234)

# simulate fully observed functions on the grid
x.full <- simul.fd(n = n, grid = gr)

# observation indicators (TRUE = observed): curve 1 is observed outside
# (.4, .7), the remaining n - 1 curves get random observation periods
x.obs <- rbind(gr <= 0.4 | gr >= 0.7, simul.obs(n = n - 1, grid = gr))

# blank out the missing periods
x <- x.full
x[!x.obs] <- NA

# plot the whole incomplete functional data set
matplot(gr, t(x), type = "l", lty = 1, xlab = "", ylab = "")

plot: all simulated curves with their missing periods removed

# show the first five curves separately, each in its own colour
matplot(gr, t(x[seq_len(5), ]), type = "l", lty = 1, col = 2:6,
        xlab = "", ylab = "")

plot: the first five curves shown separately


# summary of missingness:
# proportion of curves that are observed at each grid point
plot(gr, colMeans(!is.na(x)), type = "l", xlab = "", ylab = "")

plot: cross-sectional proportion of available values

# proportion of curves observed on the whole grid
comp <- rowSums(is.na(x)) == 0  # TRUE for complete curves
mean(comp)
[1] 0.39
# distribution of the fraction of the grid missing per incomplete curve
summary(rowSums(is.na(x[!comp, ])) / k)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.005   0.085   0.158   0.175   0.260   0.395 

## principal component analysis

mu <- mean.missfd(x)      # mean function estimate
R <- var.missfd(x)        # covariance operator estimate
eig.R <- eigen.missfd(R)  # eigendecomposition of R

# proportion of variability explained by the first six components;
# use the full component name $values — the original $val relied on
# partial matching of list names, a well-known footgun
eig.R$values[1:6] / sum(eig.R$values)
[1] 0.682937 0.192020 0.068241 0.027043 0.010225 0.007325
# extract and plot the first three eigenfunctions (principal components)
phi <- eig.R$vectors[, 1:3]
matplot(gr, phi, type = "l", lty = 1, xlab = "", ylab = "")

plot: the first three principal components


# predict the principal component scores of the incomplete curve x[1,]
# via ridge-regularised linear prediction; the result is a vector of
# scores whose "details" attribute reports, per score: the observed
# part, the predicted missing part, the prediction se, the relative
# error, the gcv-selected ridge parameter alpha, the corresponding
# effective degrees of freedom and the proportion of retained
# variability
pred.score.missfd(x[1, ], phi = phi, x = x)
[1]  0.6264 -0.4744 -0.1464
attr(,"details")
                   [,1]       [,2]       [,3]
score.obs     0.3226482 -0.3723591  0.0718922
score.miss    0.3037609 -0.1020816 -0.2183260
se            0.0586079  0.0227633  0.0505016
relerr        0.1005247  0.0736323  0.2740245
alpha         0.0003268  0.0005587  0.0002495
df.alpha      9.3761972  7.9978004 10.1729172
varprop.alpha 0.9841637  0.9767897  0.9869031

# for comparison, the true scores: the values we would obtain
# had x[1,] been fully observed
pred.score.missfd(x.full[1, ], phi = phi, x = x)
[1]  0.6609 -0.4598 -0.1781

# scores can be computed with respect to an arbitrary function,
# i.e. inner products with functions other than the PCs, e.g.
f <- sqrt(gr)
pred.score.missfd(x[1, ], phi = f, x = x)
[1] 0.1019
attr(,"details")
                    [,1]
score.obs     -0.1022520
score.miss     0.2041305
se             0.0372662
relerr         0.4880997
alpha          0.0003944
df.alpha       8.8627885
varprop.alpha  0.9819036
# the true value, based on the complete curve
pred.score.missfd(x.full[1, ], phi = f, x = x)
[1] 0.12

# diagnostic plot for the gcv minimisation: gcv is drawn for each
# score over a grid of alpha values, so one can verify that the
# minimiser (marked in red) was located correctly
pred.score.missfd(x[1, ], phi = phi, x = x, gcv.plot = TRUE)

plot: gcv criterion against alpha for each score, with the minimum marked in red

[1]  0.6264 -0.4744 -0.1464
attr(,"details")
                   [,1]       [,2]       [,3]
score.obs     0.3226482 -0.3723591  0.0718922
score.miss    0.3037609 -0.1020816 -0.2183260
se            0.0586079  0.0227633  0.0505016
relerr        0.1005247  0.0736323  0.2740245
alpha         0.0003268  0.0005587  0.0002495
df.alpha      9.3761972  7.9978004 10.1729172
varprop.alpha 0.9841637  0.9767897  0.9869031

## reconstruction of incomplete curves

# reconstruct the missing part of x[1,] by ridge-regularised
# linear prediction
x1.pred <- pred.missfd(x[1, ], x)

# plot the observed part and the predicted missing part
# (colours 2 and 3, i.e. red and green)
matplot(gr, cbind(x[1, ], x1.pred), type = "l", lty = 1, col = 2:3,
        xlab = "", ylab = "")
legend("topleft", legend = c("Observed part", "Predicted missing part"),
       bty = "n", lty = 1, col = 2:3)

plot: observed part and predicted missing part of curve 1


# compare the reconstructed missing curve with the originally observed
# values that we deleted
matplot(gr,cbind(x[1,],x1.pred),type="l",lty=1,col=2:3,xlab="",ylab="")
lines(gr[is.na(x[1,])],x.full[1,is.na(x[1,])],col=4)
legend("topleft",legend=c("Observed part","Predicted missing part",
                          "True deleted part"),bty="n",lty=1,col=2:4)

plot: reconstruction overlaid with the true deleted part of the curve


# inspect the structure of the result: x1.pred is a vector with NAs
# where x[1,] was observed and predicted values where x[1,] was
# missing; details are stored in attributes — the predictive
# covariance operator ("covar") and standard deviation ("se"), the
# relative error ("relerr"), the gcv-selected regularisation
# parameter ("alpha"), the corresponding effective degrees of freedom
# ("df.alpha") and the proportion of retained variability
# ("varprop.alpha")
str(x1.pred)
 atomic [1:200] NA NA NA NA NA NA NA NA NA NA ...
 - attr(*, "covar")= num [1:200, 1:200] NA NA NA NA NA NA NA NA NA NA ...
 - attr(*, "se")= num [1:200] NA NA NA NA NA NA NA NA NA NA ...
 - attr(*, "relerr")= num 0.169
 - attr(*, "alpha")= num 0.000394
 - attr(*, "df.alpha")= num 8.87
 - attr(*, "varprop.alpha")= num 0.982

# rerun with the gcv diagnostic plot to check the alpha minimisation
x1.pred <- pred.missfd(x[1, ], x, gcv.plot = TRUE)

plot: gcv diagnostic for the curve reconstruction


# illustrate the effect of the ridge parameter alpha: compare the
# gcv-selected value with a much smaller one (underregularised,
# unstable) and a much bigger one (overregularised, biased towards
# the overall mean)
alpha.gcv <- attr(x1.pred, "alpha")
x1.pred.alpha.small <- pred.missfd(x[1, ], x, alpha = 0.003 * alpha.gcv)
x1.pred.alpha.big <- pred.missfd(x[1, ], x, alpha = 300 * alpha.gcv)

matplot(gr,
        cbind(mu, x[1, ], x1.pred, x1.pred.alpha.small, x1.pred.alpha.big),
        type = "l", lty = 1, col = 1:5, xlab = "", ylab = "")
legend("topleft",
       legend = c("Mean", "Observed", "Alpha GCV", "Alpha small",
                  "Alpha big"),
       bty = "n", lty = 1, col = 1:5)

plot: reconstructions with gcv-selected, small and big alpha


# simultaneous prediction band (95% coverage), width proportional to
# the pointwise prediction se
x1.pred.band <- gpband(x1.pred, attr(x1.pred, "covar"))
# constant-width alternative:
# x1.pred.band <- gpband(x1.pred, attr(x1.pred, "covar"), h = 1)

# plot the observed part, the prediction and the band
matplot(gr, cbind(x[1, ], x1.pred, x1.pred.band),
        type = "l", lty = c(1, 1, 2, 2), col = c(2, 3, 3, 3),
        xlab = "", ylab = "")

# add 95% pointwise prediction intervals
matlines(gr,
         cbind(x1.pred - 1.96 * attr(x1.pred, "se"),
               x1.pred + 1.96 * attr(x1.pred, "se")),
         lty = 3, col = 3)

# overlay the true deleted values
lines(gr[is.na(x[1, ])], x.full[1, is.na(x[1, ])], col = 4)

legend("topleft",
       legend = c("Observed", "Predicted", "Band", "Intervals",
                  "True deleted"),
       lty = c(1, 1, 2, 3, 1), col = c(2, 3, 3, 3, 4), bty = "n")

plot: prediction band, pointwise intervals and the true deleted part