# SQL exercises for Data Mining II 28.11.2016 # load data dir <- "~/Code/universities/SNA/train.csv" df <- read.csv(dir) # loading package to the SQL library(sqldf) sqldf("SELECT name, sex, age FROM df") sqldf(' SELECT survived, count(*) as "No of people" FROM df GROUP BY 1 ') sqldf(' SELECT survived, count(*) as "No of people" FROM df GROUP BY 1 ') sqldf(' SELECT survived, sex, count(*) as "No of people" FROM df GROUP BY 1,2 ') #with percentage # everything is the same as in SQL, even comments or CAST functions sqldf(' SELECT survived ,sex ,no_people / m_f_count as percentage_int --,CAST(no_people as FLOAT) / m_f_count as percentage FROM ( SELECT A.survived ,A.sex ,A.no_people ,B.m_f_count FROM ( SELECT survived, sex, count(*) as no_people FROM df GROUP BY 1,2 ) A INNER JOIN ( SELECT sex, count(*) as m_f_count FROM df GROUP BY 1 ) B ON A.sex = B.sex ) inner_query ') #============================================================ # 1. # Find average fare, that was paid in general, by men and by women. # 2. # What are differences in the average price of tickets for 1st, 2nd and 3rd class # 3. # How many people Embarked in Cherbourg, Queenstown or Southampton # 4. # Were older people in general willing to pay for higher class? # For this example, person is considered older if he is over 55 # 5. # How many people were 18 and younger and without any family relatives on the deck? # 6. # How many people are missing information about their age? # 7. # What is the difference between average price of ticket for people who surivved # and people who don't? # 8. # what is the percentage of revenue made by men and women separately? # 9. # what is the percentage of revenue made by 1st class tickets? # 10. # How many people had at least 1 sibling on the deck, and what was the percentage # of those people whi survived?