# Two-sample comparison exercises.
#   Part I  - salary by gender, compared with a two-sample t-test.
#   Part II - mercury concentration by seafood type, compared with a
#             Wilcoxon rank-sum test.
# Both parts read a CSV file from the working directory, draw diagnostic
# plots on the active graphics device and print the test results.

library(tidyverse)

# Helper ----

# Run the graphical and analytical normality checks used throughout this
# script on one numeric vector.
#
# x:     numeric vector to check.
# label: text used for the histogram title/axis and as the "data:" line of
#        the printed test results.
#
# Draws a histogram and a normal Q-Q plot (with reference line), prints the
# Kolmogorov-Smirnov and Shapiro-Wilk results, and invisibly returns both
# test objects in a list.
check_normality <- function(x, label) {
  # Drop missing values once so that mean()/sd() below cannot return NA.
  x <- x[!is.na(x)]

  hist(x, breaks = 20, main = paste("Histogram of", label), xlab = label)
  qqnorm(x)
  qqline(x, col = "red", lwd = 3)

  # NOTE: the reference mean and sd are estimated from the same sample, which
  # ?ks.test says a one-sample test must not do. Treat this p-value as a
  # rough guide only and rely on shapiro.test() as the main analytical check.
  ks_result <- ks.test(x, "pnorm", mean(x), sd(x))
  ks_result$data.name <- label
  print(ks_result)

  sw_result <- shapiro.test(x)
  sw_result$data.name <- label
  print(sw_result)

  invisible(list(ks = ks_result, shapiro = sw_result))
}

# I part ----

# 1. Upload the data "gender_salary_data.csv" and name it "people".
people <- read.csv("gender_salary_data.csv")

# 2. Formulate your research question; which variables will you choose and
#    what hypothesis will you formulate? Formulate your null and alternative
#    hypothesis.
# Research question: does the mean salary (Salary_Euro) differ between
# females and males (Gender)?
# H0: the mean salary of females equals the mean salary of males.
# H1: the mean salary of females differs from the mean salary of males.

# 3. Check the assumptions for the t-test.

# 3.1. Check the sample size.
str(people)
table(people$Gender) # number of observations per group

# 3.2. Check normality of the numerical variable. Use both graphical and
#      analytical ways.
check_normality(people$Salary_Euro, "people$Salary_Euro")
# Author's observation: normally distributed.

# 3.3. Using the tidyverse package, create two variables: "female" and
#      "male" and save only females and males from the people dataset.
female <- people %>%
  filter(Gender == "Female")
male <- people %>%
  filter(Gender == "Male")

# 3.4. Check normality of females and males.
check_normality(female$Salary_Euro, "female$Salary_Euro")
# Author's observation: is normally distributed.
check_normality(male$Salary_Euro, "male$Salary_Euro")
# Author's observation: is normally distributed.

# 3.5. Check another assumption - variance (spread) between the groups.
boxplot(Salary_Euro ~ Gender, data = people)
# Author's observation: it's fine.

# 3.6. Test your hypothesis using an appropriate statistical test.
# t.test() defaults to the Welch test (var.equal = FALSE), so it does not
# require equal variances in the two groups.
t.test(Salary_Euro ~ Gender, data = people)

# 3.7. What conclusion can you make? Formulate the conclusion.
# There is a statistically significant difference in salaries between males
# and females in Germany (p-value=0.007), with females having a higher
# average salary than males.

# II part ----

# 1. Upload the data "seafood_mercury_data.csv" and name it "mercury".
mercury <- read.csv("seafood_mercury_data.csv")

# 2. Formulate your research question; which variables will you choose and
#    what hypothesis will you formulate? Formulate your null and alternative
#    hypothesis.
# Research question: does the mercury concentration (Mercury_Concentration)
# differ between fish and seafood (Seafood)?
# H0: the mercury concentration is distributed the same way in fish and in
#     seafood.
# H1: the mercury concentration in one group is shifted relative to the
#     other.

# 3. Check the assumptions for the t-test.

# 3.1. Check the sample size.
str(mercury)
table(mercury$Seafood) # number of observations per group

# 3.2. Check normality of the numerical variable. Use both graphical and
#      analytical ways.
check_normality(
  mercury$Mercury_Concentration,
  "mercury$Mercury_Concentration"
)
# Author's observation: not normally distributed.

# 3.3. Using the tidyverse package, create two variables: "fish" and
#      "seafood" and save only fish and seafood from the mercury dataset.
fish <- mercury %>%
  filter(Seafood == "Fish")
seafood <- mercury %>%
  filter(Seafood == "Seafood")

# 3.4. Check normality of fish and seafood.
check_normality(fish$Mercury_Concentration, "fish$Mercury_Concentration")
# Author's observation: is not normally distributed.
check_normality(
  seafood$Mercury_Concentration,
  "seafood$Mercury_Concentration"
)
# Author's observation: is not normally distributed.

# 3.5. The data doesn't meet the normality assumption.
#      Use the nonparametric test to avoid data transformation.
#      Check the nonparametric test assumption - similar distributions
#      between the groups.
# par() returns the previous value of the settings it changes; keep it so
# the layout that was active before is restored afterwards.
old_par <- par(mfcol = c(1, 2))
hist(seafood$Mercury_Concentration, breaks = 20)
hist(fish$Mercury_Concentration, breaks = 20)
par(old_par)

# 3.6. Test your hypothesis using an appropriate statistical test.
wilcox.test(Mercury_Concentration ~ Seafood, data = mercury)

# 3.7. What conclusion can you make? Formulate the conclusion.
# There is no statistically significant difference in mercury levels between
# fish and seafood (p-value=0.697).