##%######################################################%##
#                                                          #
####                        SEM                         ####
#                                                          #
##%######################################################%##


# Tutorial: http://lavaan.ugent.be/tutorial/
# Manuaů: http://users.ugent.be/~yrosseel/lavaan/lavaanIntroduction.pdf
# Lavaan cheat sheet: http://jeromyanglim.tumblr.com/post/33556941601/lavaan-cheat-sheet

# Preface -----------------------------------------------------------------

# Balíčky
library(tidyverse)
library(lavaan) 
library(psych)
library(lavaanPlot)
library(corrplot)
library(semTools)


### Import dat
df <- read.csv2("data/NfCCsousyp.csv")
glimpse(df)

df$zdroj %>% unique()

pretests <- df %>% 
  filter(zdroj %in% c("pretest2015", "pretest2016"))

autonomie <- df %>% 
  filter(zdroj %in% c("autonomie_P1", "autonomie_P9"))

# Funkce pro tvorbu vektoru položek
var_range <- function(prefix, range, suffix = "", width = NULL) {
  range <- sprintf(
    paste0("%0.", width, "d"),
    range
  )
  paste0(prefix, range, suffix)
}

# Jména položek
items <- var_range("nfc", 1:15, width = 2)
items

# Znění položek
nfc <- readxl::read_excel("data/NFC.xlsx")
nfc %>% head(15)
  
### Pomocné funkce
# Funkce na jednoduchý export výstupů do schránky
export <- function(x = .Last.value, row.names = FALSE) {
   write.table(x, "clipboard", sep = "\t", row.names = FALSE, dec = ",")
}

# Pomocná funkce pro fitování CFA
fit_cfa <- function(data, model, ...) {
  cfa(model = model, data = data,
      std.lv = TRUE, missing = "FIML", estimator = "MLR", ...)
}

# Pomocná funkce pro souhrn výsledků CFA
cfa_summary <- function(fit) {
   summary(fit, ci = TRUE, fit = TRUE, std = TRUE)
}

# Pomocná funkce pro modifikační indexy
mi <- function(fit) {
   modificationindices(fit, sort. = TRUE) %>% 
      as_tibble()
}

# Pomocná funkce pro indexy shody s daty
fit_measures <- function(fit) {
   out <- fitMeasures(
      fit, 
      fit.measures = c("chisq", "chisq.scaled", "df", "cfi", "cfi.scaled", 
                       "rmsea", "rmsea.scaled", "srmr")) %>% 
      as.list()
   out$AIC <- AIC(fit)
   out$BIC <- BIC(fit)
   out
}

# Popisné statistiky položek -----------------------------------------------

# Můžeme zkontrolovat šikmnost, špičatost i obtížnost
pretests %>% 
  select(all_of(items)) %>% 
  psych::describe() %>% 
  mutate(p = (mean-1) / 4)

# Podívat se na histogramy rozložení odpovědí
pretests %>% 
   select(all_of(items)) %>% 
   pivot_longer(everything()) %>% 
   ggplot(aes(value)) +
   geom_bar() +
   facet_wrap(~name)

# Vyjet si korelace
rmat <- pretests %>% 
   select(all_of(items)) %>% 
   cor(use = "pairwise")

corrplot::corrplot.mixed(rmat, lower.col = "black",
                         order = 'hclust')

nfc %>% View()

# SMC: Squared multiple correlations
# Rozptyl vysvětlený všemi ostatními položkami
# Položky s nízkými hodnotami SMC moc společného rozptylu s ostatními položkami
# Nesdílejí
r2 <- pretests %>% 
   select(all_of(items)) %>% 
   smc() %>% 
   sort(decreasing = TRUE)

plot(r2, type = "b")
text(r2, names(r2), adj = c(0.5,-0.5))

# Clusterová analýza položek
# Vytváží clustry tak, aby jejich vnitřní konzistence byla co největší
i_clusters <- pretests %>% 
   select(all_of(items)) %>% 
   iclust(output = 3, nclusters = 5)
i_clusters$clusters

# Více statistik pro položky
pretests %>% 
  select(all_of(items)) %>% 
  psych::alpha()

# Specifikace modelů ---------------------------------------------------------

# Model 1 - jednoduchý, jednodimenzionální
m_1f <- "
NFC =~ nfc01 + nfc02 + nfc03 + nfc04 + nfc05 + nfc06 + nfc07 + nfc08 + 
       nfc09 + nfc10 + nfc11 + nfc12 + nfc13 + nfc14 + nfc15

"
#Model 2 - jednoduchý, pětifaktorový
m_5f_a <- "
NfO =~ nfc01 + nfc02 + nfc03  # need for order
PRE =~ nfc04 + nfc05 + nfc06  # predictability
DEC =~ nfc07 + nfc08 + nfc09  # decisiveness
DwA =~ nfc10 + nfc11 + nfc12  # discomfort with ambiguity
CLM =~ nfc13 + nfc14 + nfc15  # closed-mindedness
"

#Model 3 - pětifaktorový s korelací reziduí
m_5f_b <- "
NfO =~ nfc01 + nfc02 + nfc03  # need for order
PRE =~ nfc04 + nfc05 + nfc06  # predictability
DEC =~ nfc07 + nfc08 + nfc09  # decisiveness
DwA =~ nfc10 + nfc11 + nfc12  # discomfort with ambiguity
CLM =~ nfc13 + nfc14 + nfc15 +  # close-mindedness
       nfc12 # crossloading: nemám rád, když  nějaký výrok může znamenat víc věcí
       
nfc11 ~~ nfc12
"

#Model 4 - jednoduchý, čtyřfaktorový
m_4f_a <-"
NfO  =~ nfc01 + nfc02 + nfc03    # need for order
PDwA =~ nfc04 + nfc05 + nfc06 +  # predictability + discomfort with ambiguity
        nfc10 + nfc11 + nfc12 
DEC  =~ nfc07 + nfc08 + nfc09    # decisiveness
CLM  =~ nfc13 + nfc14 + nfc15    # close-mindedness
"

#Model 4a - čtyřfaktorový s komplikacemi
m_4f_b <- "
NfO  =~ nfc01 + nfc02 + nfc03    # need for order
PDwA =~ nfc04 + nfc05 + nfc06 +  # predictability + discomfort with ambiguity
        nfc10 + nfc11 + nfc12 
DEC  =~ nfc07 + nfc08 + nfc09    # decisiveness
CLM  =~ nfc13 + nfc14 + nfc15 + # close-mindedness
        nfc12
        
nfc11 ~~ nfc12
"

#Model 5 - bifaktor, pětifaktorový - ortogonalita faktorů je zajištěna voláním cfa
m_bif <- "
G =~ nfc01 + nfc02 + nfc03 + nfc04 + nfc05 + nfc06 + nfc07 + nfc08 + 
     nfc09 + nfc10 + nfc11 + nfc12 + nfc13 + nfc14 + nfc15

# nfc10 as a reference item (loading only on the general factor)

NfO =~ nfc01 + nfc02 + nfc03  # need for order
PRE =~ nfc04 + nfc05 + nfc06  # predictability
DEC =~ nfc07 + nfc08 + nfc09  # decisiveness
CLM =~ nfc13 + nfc14 + nfc15 + # closed-mindedness
       nfc12 # crossloading

# Residual covariance
nfc11 ~~ nfc12

# All factors orthogonal
G   ~~ 0*NfO + 0*PRE + 0*DEC + 0*CLM
NfO ~~ 0*PRE + 0*DEC + 0*CLM
PRE ~~ 0*DEC + 0*CLM
DEC ~~ 0*CLM
"

# Odhad modelu ---------------------------------------------------------

# Můžeme si vytvořit list se všemi modely 
m_list <- list(
   one_fct = m_1f,
   four_fct = m_4f_a,
   four_fct_tuned = m_4f_b,
   five_fct = m_5f_a,
   five_fct_tuned = m_5f_b,
   bifac = m_bif
)

# Fitování všech modelů
fits_mlr <- m_list %>% 
   map(~fit_cfa(data = pretests, model = .x))

# Přehled fitů
fit_indices_mlr <- fits_mlr %>% 
  map_df(fit_measures) %>% 
  mutate(model = names(m_list)) %>% 
  select(model, everything())
fit_indices_mlr

# Jednofaktorový má hodně špatný fit
cfa_summary(fits_mlr[["one_fct"]])

# Pětifaktorový má dobrý fit,
# Ale podívejte se na korelaci mezi faktory
cfa_summary(fits_mlr[["five_fct_tuned"]])

# Bifaktorový má nejlepší fit, ale je zároveň nesložitější
cfa_summary(fits_mlr[["bifac"]])

# Kdybychom chtěli zacházet s proměnnými jako s ordinálními
# a použít polychorické korelace
fits_dwls <- m_list %>% 
   map(
      ~cfa(.x, data = pretests, ordered = TRUE, std.lv = TRUE,
           missing = "pairwise")
   )

# Protože odhadované korelace bodou silnější
# je zpravidla fit dle CFI lepší, zatímco fit dle RMSEA a SRMR horší
fit_indices_dwls <- fits_dwls %>% 
  map_df(fit_measures) %>%
  mutate(model = names(m_list)) %>% 
  select(model, everything())
fit_indices_dwls

# Ale neignorujte varování
cfa_summary(fits_dwls[["five_fct_tuned"]]) # Podívejte se na korelace mezi faktory
cfa_summary(fits_dwls[["bifac"]]) # Podívejte se na reziduální rozptyly


# Cross-validation --------------------------------------------------------
# Složitější modely často vedou k over-fittingu
# Model dobře fituje na konkrétní vzorek
# ale jeho generalizovatelnost (populační fit nebo fit u jiných vzorků)
# je horší než u "parsimonnějších" modelů

### Použít odhady fitovaného modelu jako startovní hodnoty
# a startovní hodnoty jako konečné odhady
fit_imposed1 <- imposeStart(fits_mlr[["four_fct_tuned"]], 
                            fit_cfa(m_4f_a, data = autonomie, do.fit = FALSE))
fit_imposed2 <- imposeStart(fits_mlr[["bifac"]], 
                            fit_cfa(m_bif, data = autonomie, do.fit = FALSE))
# Porovant SRMR
lavResiduals(fit_imposed1)
lavResiduals(fit_imposed2)


# Výsledky ----------------------------------------------------------------

# Finální model, který chceme použít 
fit <- fits_mlr[["four_fct_tuned"]]

# Stručný souhrn výsledků.
cfa_summary(fit)

#Přehled odhadnutých parametrů - nepřehledný, technický.
parameterestimates(fit, level = .99)

#Všechny indikátory shody počítané lavaanem.
fitmeasures(fit)

# Jen vybrané, které jsme si definovali ve custom funkci
fit_measures(fit)

# Shrnutí fitů
fit_indices_mlr
fit_indices_dwls

# Použití jednoduššího modelu  -------------------------------------

# Co kdybychom pro dosažení parsimonie zafixovali nastavili
# u všech položek shodné práhy a povolili pouze intercepty?

m_4f_c <- "
NfO  =~ nfc01 + nfc02 + nfc03    # need for order
PDwA =~ nfc04 + nfc05 + nfc06 +  # predictability + discomfort with ambiguity
        nfc10 + nfc11 + nfc12 
DEC  =~ nfc07 + nfc08 + nfc09    # decisiveness
CLM  =~ nfc13 + nfc14 + nfc15 +  # close-mindedness
        nfc12
        
# Fixed thresholds across items
nfc01 | a*t1 + b*t2 + c*t3 + d*t4
nfc02 | a*t1 + b*t2 + c*t3 + d*t4
nfc03 | a*t1 + b*t2 + c*t3 + d*t4
nfc04 | a*t1 + b*t2 + c*t3 + d*t4
nfc05 | a*t1 + b*t2 + c*t3 + d*t4
nfc06 | a*t1 + b*t2 + c*t3 + d*t4
nfc07 | a*t1 + b*t2 + c*t3 + d*t4
nfc08 | a*t1 + b*t2 + c*t3 + d*t4
nfc09 | a*t1 + b*t2 + c*t3 + d*t4
nfc10 | a*t1 + b*t2 + c*t3 + d*t4
nfc11 | a*t1 + b*t2 + c*t3 + d*t4
nfc12 | a*t1 + b*t2 + c*t3 + d*t4
nfc13 | a*t1 + b*t2 + c*t3 + d*t4
nfc14 | a*t1 + b*t2 + c*t3 + d*t4
nfc15 | a*t1 + b*t2 + c*t3 + d*t4

# Free latent response intercepts (exept for one item)
nfc01 ~ 0*1
nfc02 ~ NA*1
nfc03 ~ NA*1
nfc04 ~ NA*1
nfc05 ~ NA*1
nfc06 ~ NA*1
nfc07 ~ NA*1
nfc08 ~ NA*1
nfc09 ~ NA*1
nfc10 ~ NA*1
nfc11 ~ NA*1
nfc12 ~ NA*1
nfc13 ~ NA*1
nfc14 ~ NA*1
nfc15 ~ NA*1
"

fit_equal_thr <- cfa(m_4f_c, data = pretests,
                     effect.coding = TRUE,
                     ordered = TRUE, missing = "pairwise")

cfa_summary(fit_equal_thr)


# SEM Diagram ---------------------------------------------------------------

# Pomocí balíčku lavaanPlot
lavaanPlot(model = fit, coefs = TRUE, covs = TRUE, stand = TRUE,
                       graph_options = list(layout = "dot"))

# Pomocí balíčku semPlot
semPlot::semPaths(fit,
                  whatLabels = "std",
                  edge.label.position = .65,
                  intercepts = FALSE,
                  residuals = TRUE,
                  style = "lisrel",
                  sizeLat = 6,
                  covAtResiduals = FALSE,
                  edge.label.cex = 1,
                  layout = "circle")

semPlot::semPaths(fit,
                  whatLabels = "std",
                  edge.color = "black",
                  edge.label.position = .5,
                  label.scale = FALSE,
                  intercepts = FALSE,
                  residuals = TRUE,
                  style = "lisrel",
                  sizeLat = 6,
                  covAtResiduals = FALSE,
                  edge.label.cex = 1,
                  layout = "tree")

# Detaily řešení extrahované inspectem ------------------------------------

# Modelem implikovaná variačně-kovarianční matice a vektor průměrů.
fitted(fit)

# Vstupní kovarianční matice a vektor průměrů
inspect(fit, 'sampstat')


# Koeficienty modelu
inspect(fit, what = "est")  # vše
inspect(fit, what = "est")$lambda  # faktorové náboje
inspect(fit, what = "est")$theta   # residuální variance/kovariance pozorovaných proměnných
inspect(fit, what = "est")$psi     # kovariance/korelace faktorů


# Residuální matice.
lavResiduals(fit, type = "raw") # variance/kovariance
lavResiduals(fit, type = "cor") # korelace
lavResiduals(fit, type = "cor.bentler") # Z-test

# Můžeme se podívat na to, které pozorované vztahy model špatně reprodukuje
# Nejlépe graficky
colfunc <- colorRampPalette(c("red", "white", "blue"))
colfunc(20)

lavResiduals(fit, type = "cor")$cov %>% 
  corrplot::corrplot.mixed(upper = "color", lower = 'pie',
                           upper.col = colfunc(30))
lavResiduals(fit, type = "cor")$cov %>% 
  corrplot::corrplot.mixed(upper = "color", lower = 'number',
                           lower.col = "black",
                           upper.col = colfunc(30))

# Histogram standardizovaných reziduí
lavResiduals(fit, type = "cor.bentler")$cov.z %>% 
  qplot(binwidth = 0.5)

#Nahlédnutí do objektu lavaan. Defaultně dá přehled parametrů.
inspect(fit) # Volné parametry (nulu mají parametry zafixované na 0)
inspect(fit, what = "patterns") # missing data patterns
inspect(fit, what = "std.nox") # Kompletně standardizované řešení

?inspect # Plno dalších voleb, co si můžeme zobrazit
  
# Modifikační indexy -----------------------------------------------------
mi(fit)

# Porovnávání modelů ------------------------------------------------------
names(fits_mlr)

# LRT chisquare test pro vnořené modely.
lavTestLRT(fits_mlr[["four_fct"]], fits_mlr[["four_fct_tuned"]])
lavTestLRT(fits_mlr[["five_fct"]], fits_mlr[["five_fct_tuned"]])

# Non-nested modely často porovnáváme podle AIC nebo BIC
fits_mlr %>% 
   map_df(fit_measures)

# Neparameticky pomocí lavaan::bootstrapLRT
# Bude trvat docela dlouho i při neparalelním zpracování
bootstrapLRT(h0 = fits_mlr[[2]], h1 = fits_mlr[[3]], 
             type = "ordinary",
             R = 1000, iseed = 2021,
             parallel = "snow", ncpus = 6,              
             verbose = FALSE, return.LRT = TRUE)


# Reliability -------------------------------------------------------------

# semTools počítají reliabilitu - McDonald omega
semTools::reliability(fit, return.total = TRUE)
semTools::reliability(fits_dwls[["four_fct_tuned"]], return.total = TRUE)
semTools::reliability(fits_mlr[["bifac"]], return.total = TRUE)

# Omega1 assumes that all covariance is accounted for by the common factor model, 
# with no residual covariances between observed variables. This assumption is 
# the basis for computing total variance (the denominator) using only the 
# loadings and the covariances among factors.

# Omega2 relaxes the "no residual covariance" assumption, and so the denominator 
# is simply the model-implied covariance matrix of the observed variables.

# Omega3, which is intended for models with constraints on relations between 
# factors, replaces omega2's model-implied covariance matrix with the observed 
# covariance matrix. If relations between factors are saturated, then there is 
# no room for the structural model to be inconsistent with the data--this is 
# what omega2 assumes. But if relations between factors are structured, then 
# there may be misfit between model and data, which may have implications for 
# estimated reliability. Omega3 allows for this possibility.

# Assessing normality --------------------------------------------------

# Normalita jen zřídka naplněna, multivariační zvlášť.
pretests %>% 
   select(all_of(items)) %>% 
   mardia() # Mardia's test

# Henze-Zirkler test a Robustní Mahalanobisovy vzdálenostii
pretests %>% 
   select(all_of(items)) %>% 
   MVN::mvn(multivariateOutlierMethod = "quan",
            showOutliers = TRUE)

# Measurement invariance ------------------------------------------------

# * Items treated as continuous ---------------------------------------------

# Base model
m <- "
NfO  =~ nfc01 + nfc02 + nfc03    # need for order
PDwA =~ nfc04 + nfc05 + nfc06 +  # predictability + discomfort with ambiguity
        nfc10 + nfc11 + nfc12 
DEC  =~ nfc07 + nfc08 + nfc09    # decisiveness
CLM  =~ nfc13 + nfc14 + nfc15 + # close-mindedness
        nfc12
"

# List of constraints
# Obvykle nejdříve constrainujeme náboje, pak intercepty a případně 
# i residuální rozptyly

# Pro srovnání průměrů stačí dosáhnout alespoň (parciální) skalární invariance
constrain <- list(
   configural = "",
   metric = "loadings",
   metric_partial = "loadings",
   scalar = c("loadings", "intercepts"),
   scalar_partial =  c("loadings", "intercepts"),
   strict = c("loadings", "intercepts", "residuals"),
   strict_partial = c("loadings", "intercepts", "residuals")
)

# List of free parameters
free <- list(
   configural = "",
   metric = "",
   metric_partial = "CLM =~ nfc12",
   scalar = "",
   scalar_partial =  c("CLM =~ nfc12", "nfc02 ~ 1"),
   strict = "",
   strict_partial = c("CLM =~ nfc12", "nfc02 ~ 1", "nfc02~~nfc02")
)

# Fit all
fits_mi <-  map2(
   constrain, free,
   ~fit_cfa(pretests, model = m, group = "student_psy",
            group.equal = .x, group.partial = .y)
   )

# Fit measures
fits_mi %>% 
   map_df(fit_measures) %>% 
   mutate(model = names(fits_mi)) %>% 
   select(model, everything())

# LRT tests
lavTestLRT(fits_mi[["configural"]], fits_mi[["metric"]])
lavTestLRT(fits_mi[["metric"]], fits_mi[["scalar"]])
lavTestLRT(fits_mi[["scalar"]], fits_mi[["strict"]])

# Score test (or Lagrange Multiplier test) for releasing one or more fixed or 
# constrained parameters in model
score_test <- lavTestScore(fits_mi[["scalar"]])
score_test$uni %>% 
   arrange(desc(X2))

parameterEstimates(fits_mi[["scalar"]]) %>% 
   filter(label %in% c(".p52.", ".p55.", ".p43."))

# * Items treated as ordinal ---------------------------------------------

# Configural invariance
m <- "
NfO  =~ nfc01 + nfc02 + nfc03    # need for order
PDwA =~ nfc04 + nfc05 + nfc06 +  # predictability + discomfort with ambiguity
        nfc10 + nfc11 + nfc12 
DEC  =~ nfc07 + nfc08 + nfc09    # decisiveness
CLM  =~ nfc13 + nfc14 + nfc15 + # close-mindedness
        nfc12
        
nfc11 ~~ nfc12
"

# Při ordiální CFA se nově doporučuje constrainovat nejprve 
# práhy, pak náboje a nakonec intercepty
constrain <- list(
   configural = "configural",
   thresholds = "thresholds",
   loadings = c("thresholds", "loadings"),
   intercepts = c("thresholds", "loadings", "intercepts"),
   means = c("thresholds", "loadings", "intercepts", "means")
)

# Správné definice modelů usnadňujě funkce measEq.syntax() z balíčku semTools
measEq.syntax(configural.model = m,
              data = pretests,
              ordered = TRUE,
              parameterization = "delta", 
              ID.fac = "std.lv",
              ID.cat = "Wu.Estabrook.2016",
              group = "student_psy",
              group.equal = "thresholds") %>% 
  as.character() %>% 
  cat()

# Necháme si specifikace modelů vygenerovat všechny
m_list <- constrain %>% 
  map(
      ~measEq.syntax(configural.model = m,
                     data = pretests,
                     ordered = TRUE,
                     parameterization = "delta", 
                     ID.fac = "std.lv",
                     ID.cat = "Wu.Estabrook.2016",
                     group = "student_psy",
                     group.equal = .x)
   ) %>% 
  map(as.character) %>% 
  set_names(names(constrain))

# Fitování všech definovaných modelů
fits_mi_cat <- m_list %>% 
   map(
      ~cfa(model = .x, data = pretests, group = "student_psy",
           ordered = TRUE, missing = "pairwise")
   )

fits_mi_cat %>% 
   map_df(fit_measures) %>% 
   mutate(model = names(m_list)) %>% 
   select(model, everything())

i <- 1
while (i < 5) {
  names(fits_mi_cat[c(i, i+1)]) %>% print()
  lavTestLRT(fits_mi_cat[[i]], fits_mi_cat[[i + 1]]) %>% print()
  
  i <- i + 1
}

# lavaan zvládne i EFA ----------------------------------------------------

unrotated <- efaUnrotate(pretests, nf = 5,
                         varList = items, estimator = "MLR",
                         missing = "ML")
fit_measures(unrotated)

# Šikmá rotace geomin
efa_geomin <- oblqRotate(unrotated, method = "geomin")
efa_geomin

efa_geomin@loading %>% round(2)

# Target rotation - Exploratory SEM
library(GPArotation)

target <- matrix(0, nrow = 15, ncol = 5)
target

target[1:3, 1] <- NA
target[4:6, 2] <- NA
target[7:9, 3] <- NA
target[10:12, 4] <- NA
target[13:15, 5] <- NA

colnames(target) <- c("NfO", "PRE", "DEC", "DwA", "CLM")

efa_target <- funRotate(unrotated, fun = "targetQ", Target = target)
efa_target
efa_target@loading %>% round(2)