#I part

#1. Upload the data "gender_salary_data.csv" and name it "people".

# Read the salary data (path is relative to the working directory).
people <- read.csv(file = "gender_salary_data.csv")

#2. Formulate your research question; which variables will you choose and what hypothesis 
#will you formulate? Formulate your null and alternative hypothesis.
#3. Check the assumptions for the t-test. 
#3.1. Check the sample size.

# str() reports the total number of observations and the type of each column.
str(people)
# The t-test compares two groups, so the size of each group matters as well as
# the total; useNA shows rows with a missing Gender, if there are any.
table(people$Gender, useNA = "ifany")

#3.2. Check normality of the numerical variable. Use both graphical and analytical ways.

# Graphical checks: histogram, then a normal Q-Q plot with a reference line.
# Author's reading of the histogram: normally distributed.
hist(people$Salary_Euro, breaks = 20)
qqnorm(people$Salary_Euro)
qqline(people$Salary_Euro, lwd = 3, col = "red")

# Analytical checks: Kolmogorov-Smirnov against a normal distribution whose
# mean and sd are taken from the sample, followed by Shapiro-Wilk.
# NOTE(review): ks.test() expects the reference parameters to be fixed in
# advance; estimating them from the same data makes its p-value approximate,
# so treat the Shapiro-Wilk result as the main analytical evidence.
ks.test(
  people$Salary_Euro,
  "pnorm",
  mean = mean(people$Salary_Euro),
  sd = sd(people$Salary_Euro)
)
shapiro.test(people$Salary_Euro)

#3.3. Using the tidyverse package, create two variables: "female" and "male" and 
#save only females and males from the people dataset.

# filter() comes from dplyr, which is attached as part of the tidyverse.
library(tidyverse)

# One data frame per gender: rows whose Gender equals the given label.
female <- filter(people, Gender == "Female")
male <- filter(people, Gender == "Male")

#3.4. Check normality of females and males.

# --- Females ---
# Graphical checks (author's reading of the histogram: normally distributed).
hist(female$Salary_Euro, breaks = 20)
qqnorm(female$Salary_Euro)
qqline(female$Salary_Euro, lwd = 3, col = "red")
# Analytical checks.
# NOTE(review): the K-S reference mean/sd are estimated from the same sample,
# so its p-value is approximate; rely mainly on Shapiro-Wilk.
ks.test(
  female$Salary_Euro,
  "pnorm",
  mean = mean(female$Salary_Euro),
  sd = sd(female$Salary_Euro)
)
shapiro.test(female$Salary_Euro)

# --- Males ---
# Graphical checks (author's reading of the histogram: normally distributed).
hist(male$Salary_Euro, breaks = 20)
qqnorm(male$Salary_Euro)
qqline(male$Salary_Euro, lwd = 3, col = "red")
# Analytical checks (same caveat about the K-S p-value as for females).
ks.test(
  male$Salary_Euro,
  "pnorm",
  mean = mean(male$Salary_Euro),
  sd = sd(male$Salary_Euro)
)
shapiro.test(male$Salary_Euro)


#3.5. Check another assumption - variance (spread) between the groups.

# Compare the spread of salaries between genders with side-by-side boxplots.
# Passing the data frame through `data =` keeps the axis labels as plain column
# names instead of "people$...".
# Author's reading of the plot: it's fine.
boxplot(Salary_Euro ~ Gender, data = people)

#3.6. Test your hypothesis using an appropriate statistical test. 

# Two-sample, two-sided t-test of salary by gender. t.test() uses
# var.equal = FALSE by default, i.e. the Welch test, which does not require
# equal variances in the two groups.
# The formula method stops unless the grouping variable has exactly two
# levels, so the comparison is restricted explicitly to the two groups of
# interest in case Gender contains other categories.
t.test(
  Salary_Euro ~ Gender,
  data = people,
  subset = Gender %in% c("Female", "Male")
)

#3.7. What conclusion can you make? Formulate the conclusion.

#There is a statistically significant difference in salaries between males and females in Germany (p-value=0.007), with females having a higher average salary than males.

#II part

#1. Upload the data "seafood_mercury_data.csv" and name it "mercury".

# Read the mercury data (path is relative to the working directory).
mercury <- read.csv(file = "seafood_mercury_data.csv")

#2. Formulate your research question; which variables will you choose and what hypothesis 
#will you formulate? Formulate your null and alternative hypothesis.
#3. Check the assumptions for the t-test. 
#3.1. Check the sample size.

# str() reports the total number of observations and the type of each column.
str(mercury)
# The comparison is between two groups, so the size of each group matters as
# well as the total; useNA shows rows with a missing Seafood value, if any.
table(mercury$Seafood, useNA = "ifany")

#3.2. Check normality of the numerical variable. Use both graphical and analytical ways.

# Graphical checks: histogram, then a normal Q-Q plot with a reference line.
# Author's reading of the histogram: not normally distributed.
hist(mercury$Mercury_Concentration, breaks = 20)
qqnorm(mercury$Mercury_Concentration)
qqline(mercury$Mercury_Concentration, lwd = 3, col = "red")

# Analytical checks: Kolmogorov-Smirnov against a normal distribution whose
# mean and sd are taken from the sample, followed by Shapiro-Wilk.
# NOTE(review): ks.test() expects the reference parameters to be fixed in
# advance; estimating them from the same data makes its p-value approximate,
# so treat the Shapiro-Wilk result as the main analytical evidence.
ks.test(
  mercury$Mercury_Concentration,
  "pnorm",
  mean = mean(mercury$Mercury_Concentration),
  sd = sd(mercury$Mercury_Concentration)
)
shapiro.test(mercury$Mercury_Concentration)

#3.3. Using the tidyverse package, create two variables: "fish" and "seafood" and 
#save only fish and seafood from the mercury dataset.

# filter() comes from dplyr, which is attached as part of the tidyverse.
# Part I attaches it too; repeating the call lets Part II run on its own.
library(tidyverse)

# One data frame per group: rows whose Seafood column equals the given label.
fish <- filter(mercury, Seafood == "Fish")
seafood <- filter(mercury, Seafood == "Seafood")

#3.4. Check normality of fish and seafood.

# --- Fish ---
# Graphical checks (author's reading: is not normally distributed).
hist(fish$Mercury_Concentration, breaks = 20)
qqnorm(fish$Mercury_Concentration)
qqline(fish$Mercury_Concentration, lwd = 3, col = "red")
# Analytical checks.
# NOTE(review): the K-S reference mean/sd are estimated from the same sample,
# so its p-value is approximate; rely mainly on Shapiro-Wilk.
ks.test(
  fish$Mercury_Concentration,
  "pnorm",
  mean = mean(fish$Mercury_Concentration),
  sd = sd(fish$Mercury_Concentration)
)
shapiro.test(fish$Mercury_Concentration)

# --- Seafood ---
# Graphical checks (author's reading: is not normally distributed).
hist(seafood$Mercury_Concentration, breaks = 20)
qqnorm(seafood$Mercury_Concentration)
qqline(seafood$Mercury_Concentration, lwd = 3, col = "red")
# Analytical checks (same caveat about the K-S p-value as for fish).
ks.test(
  seafood$Mercury_Concentration,
  "pnorm",
  mean = mean(seafood$Mercury_Concentration),
  sd = sd(seafood$Mercury_Concentration)
)
shapiro.test(seafood$Mercury_Concentration)

#3.5. The data doesn't meet the normality assumption.
#Use the nonparametric test to avoid data transformation.
#Check the nonparametric test assumption - similar distributions between the groups.

# Draw the two group histograms side by side to compare distribution shapes.
# Both panels share the same x-axis limits; otherwise each histogram is scaled
# to its own range and the shapes cannot be compared by eye.
shared_xlim <- range(
  c(seafood$Mercury_Concentration, fish$Mercury_Concentration),
  na.rm = TRUE
)

# par() returns the previous settings invisibly; keep them so the layout that
# was active before is restored, rather than assuming it was 1 x 1.
old_par <- par(mfcol = c(1, 2))
hist(seafood$Mercury_Concentration, breaks = 20, xlim = shared_xlim)
hist(fish$Mercury_Concentration, breaks = 20, xlim = shared_xlim)
par(old_par)

#3.6. Test your hypothesis using an appropriate statistical test. 
# Two-sample Wilcoxon rank-sum (Mann-Whitney) test of mercury concentration
# by group, used because the normality assumption was not met.
# The formula method stops unless the grouping variable has exactly two
# levels, so the comparison is restricted explicitly to the two groups of
# interest in case Seafood contains other categories.
wilcox.test(
  Mercury_Concentration ~ Seafood,
  data = mercury,
  subset = Seafood %in% c("Fish", "Seafood")
)

#3.7. What conclusion can you make? Formulate the conclusion.

#There is no statistically significant difference in mercury levels between fish and seafood (p-value=0.697).