# Load the pig phenotype data from the working directory.
pigs <- read.csv(file = "pigs_pheno.csv")

# Assume the variables have already been explored, cleaned and harmonised,
# and that histograms and boxplots have been drawn. The next step is to check
# normality of the data, one of the assumptions of parametric tests such as
# the t-test used in this script.

# Let's look at the sample size.
str(object = pigs)

# Ways to check normality, applied here to the bodyweights of all pigs:
# a histogram, a normal Q-Q plot with a reference line, and a
# Kolmogorov-Smirnov test against a normal distribution that uses the sample
# mean and standard deviation.
# NOTE(review): the mean and sd are estimated from the same sample; ?ks.test
# states the parameters should be pre-specified, so read the p-value as a
# rough guide only - confirm whether shapiro.test() would be preferred.
hist(pigs$Bodyweight, breaks = 20) # not normally distributed
qqnorm(pigs$Bodyweight)
qqline(pigs$Bodyweight, lwd = 3, col = "red")
ks.test(
  pigs$Bodyweight, "pnorm",
  mean = mean(pigs$Bodyweight), sd = sd(pigs$Bodyweight)
)

library(tidyverse)
# Split the data by diet so that each group can be examined on its own.
hf <- filter(pigs, Diet == "hf")

chow <- filter(pigs, Diet == "chow")

# High-fat diet group: histogram, Q-Q plot and Kolmogorov-Smirnov test.
hist(hf$Bodyweight, breaks = 20) # is normally distributed
qqnorm(hf$Bodyweight)
qqline(hf$Bodyweight, lwd = 3, col = "red")
ks.test(
  hf$Bodyweight, "pnorm",
  mean = mean(hf$Bodyweight), sd = sd(hf$Bodyweight)
)


# Chow diet group: the same three checks.
hist(chow$Bodyweight, breaks = 20) # is normally distributed
qqnorm(chow$Bodyweight)
qqline(chow$Bodyweight, lwd = 3, col = "red")
ks.test(
  chow$Bodyweight, "pnorm",
  mean = mean(chow$Bodyweight), sd = sd(chow$Bodyweight)
)

#We fail one assumption - normality of the data (judged on all pigs pooled together).
#NOTE(review): the t-test assumption concerns the distribution within each group,
#and both diet groups were marked as normally distributed above - confirm that
#the pooled check is the one intended here.
#Let's check whether the variance is similar between the groups.

#Side-by-side boxplots of bodyweight per diet to compare the spread visually.
boxplot(pigs$Bodyweight ~ pigs$Diet) #it's fine

#But because we failed with normality we have two options:
#1. Log-transform the data, check whether it then fits the t-test assumptions, and use the parametric t-test.
#2. Use the non-parametric Mann-Whitney test. There is no single right answer and both ways could be correct. Let's try both.

# Let's use the nonparametric Mann-Whitney test.
# First, check one of its assumptions: the two groups should have similarly
# shaped distributions. Draw both histograms side by side.
old_par <- par(mfcol = c(1, 2)) # par() returns the settings it replaces
hist(hf$Bodyweight, breaks = 20)
hist(chow$Bodyweight, breaks = 20)
# Restore whatever layout was active before instead of forcing a 1x1 grid.
par(old_par)

# The test itself: bodyweight compared between the two diet groups.
wilcox.test(pigs$Bodyweight ~ pigs$Diet)

#There is a statistically significant difference in pigs bodyweight between hf and chow diet (p-value<0.001).

# Now let's try the log transformation (base 10) of the bodyweights.
pigs_transf <- log10(pigs$Bodyweight)

# Re-check the assumptions on the transformed values.
hist(pigs_transf, breaks = 20) # normally distributed
qqnorm(pigs_transf)
qqline(pigs_transf, lwd = 3, col = "red")
ks.test(pigs_transf, "pnorm", mean = mean(pigs_transf), sd = sd(pigs_transf))

# Compare the spread of the transformed bodyweights between the diets.
boxplot(log10(pigs$Bodyweight) ~ pigs$Diet)

# Let's do the t-test on the log-transformed bodyweights.

t.test(log10(pigs$Bodyweight) ~ pigs$Diet) # significant as well

#Thus, we can say that there is a significant difference in the pigs weight between chow and hf diets, where the average weight is higher in the hf group.

#The interpretation of log-transformed data isn't easy. The mean of the log values
#is not the same as the log of the mean, so we can't simply unlog
#these values: back-transforming gives geometric means, not simple (arithmetic) means. So, I would go to a mathematician
#or a more experienced statistician to help me with that.

#How I would report the results:
#There is a statistically significant difference in pigs bodyweight between high fat 
#and chow diet (p-value<0.001). 
#If we want to see the relative difference, we unlog the
#output and take the ratio (29.51/26.91). The ratio between the hf and chow diet is 1.097.
#The relative difference in pigs bodyweight between hf and chow diet is 1.097.

# Between genders: split the data into female and male pigs.
female <- filter(pigs, Gender == "F")
male <- filter(pigs, Gender == "M")

# Female pigs: histogram, Q-Q plot and Kolmogorov-Smirnov test against a
# normal distribution with the group's own mean and standard deviation.
hist(female$Bodyweight, breaks = 20) # is not normally distributed
qqnorm(female$Bodyweight)
qqline(female$Bodyweight, col = "red", lwd = 3)
ks.test(
  female$Bodyweight, "pnorm",
  mean(female$Bodyweight), sd(female$Bodyweight)
)


# Male pigs: the same three checks. The reference distribution must use the
# male group's own mean and sd (previously the chow group's values were used
# by mistake, which tested the males against the wrong normal distribution).
hist(male$Bodyweight, breaks = 20) # is not normally distributed
qqnorm(male$Bodyweight)
qqline(male$Bodyweight, col = "red", lwd = 3)
ks.test(
  male$Bodyweight, "pnorm",
  mean(male$Bodyweight), sd(male$Bodyweight)
)

# Compare the spread of bodyweight between the genders.
boxplot(pigs$Bodyweight ~ pigs$Gender) # it's fine

#Total fail of the normality assumptions
#Let's perform the nonparametric Mann Whitney. For this test we have to check 
#similarity of the data spread between the groups.

# Draw the male and female histograms side by side to compare their shapes.
old_par <- par(mfcol = c(1, 2)) # par() returns the settings it replaces
hist(male$Bodyweight, breaks = 20)
hist(female$Bodyweight, breaks = 20)
# Restore whatever layout was active before instead of forcing a 1x1 grid.
par(old_par)

# Mann-Whitney test of bodyweight between the genders.
wilcox.test(pigs$Bodyweight ~ pigs$Gender)