# SQL exercises for Data Mining II 2.11.2016 # load data dir <- "~/Code/GitHub/jurajkapasny/SNA/train.csv" df <- read.csv(dir) # loading package to the SQL library(sqldf) sqldf("SELECT name, sex, age FROM df") sqldf(' SELECT survived, count(*) as "No of people" FROM df GROUP BY 1 ') sqldf(' SELECT survived, count(*) as "No of people" FROM df GROUP BY 1 ') sqldf(' SELECT survived, sex, count(*) as "No of people" FROM df GROUP BY 1,2 ') #with percentage # everything is the same as in SQL, even comments or CAST functions sqldf(' SELECT survived ,sex ,no_people / m_f_count as percentage_int --,CAST(no_people as FLOAT) / m_f_count as percentage FROM ( SELECT A.survived ,A.sex ,A.no_people ,B.m_f_count FROM ( SELECT survived, sex, count(*) as no_people FROM df GROUP BY 1,2 ) A INNER JOIN ( SELECT sex, count(*) as m_f_count FROM df GROUP BY 1 ) B ON A.sex = B.sex ) inner_query ') #============================================================ # 1. # Find average fare, that was paid in general, by men and by women. sqldf(' SELECT sex, avg(fare) as avg_fare FROM df GROUP BY 1 ') sqldf(' SELECT max(avg_fare) - min(avg_fare) FROM ( SELECT sex, avg(fare) as avg_fare FROM df GROUP BY 1 ) as inner_query ') # 2. # What are differences in the average price of tickets for 1st, 2nd and 3rd class sqldf(' SELECT Pclass, avg(fare) as avg_fare FROM df GROUP BY 1 ') # 3. # How many people Embarked in Cherbourg, Queenstown or Southampton sqldf(' SELECT Embarked, count(*) as no_people FROM df GROUP BY 1 ') # 4. # Were older people in general willing to pay for higher class? # For this example, person is considered older if he is over 55 sqldf(' SELECT Pclass, count(*) as no_people FROM df WHERE age > 55 GROUP BY 1 ') # 5. # How many people were 18 and younger and without any family relatives on the deck? # How many of them died? sqldf(' SELECT count(*) as no_people FROM df WHERE age <= 18 AND SibSp = 0 ') sqldf(' SELECT survived, count(*) as no_people FROM df WHERE age <= 18 AND SibSp = 0 GROUP BY 1 ') # 6. # How many people are missing information about their age? sqldf(' SELECT count(*) as no_people FROM df WHERE age is null ') # 7. # What is the difference between average price of ticket for people who surivved # and people who don't? sqldf(' SELECT max(avg_fare) - min(avg_fare) FROM ( SELECT survived, avg(fare) as avg_fare FROM df GROUP BY 1 ) as inner_query ') # bad point is we don't know if maximum is for survived = 1 or 0 # 2nd solution sqldf(' SELECT A.fare_survived ,B.fare_died ,A.fare_survived - B.fare_died as diff FROM ( SELECT avg(fare) as fare_survived FROM df WHERE survived = 1 ) as A CROSS JOIN ( SELECT avg(fare) as fare_died FROM df WHERE survived = 0 ) as B ') # 8. # what is the percentage of revenue made by men and women separately? sqldf(' SELECT A.sex ,A.revenue_sex ,B.revenue ,A.revenue_sex / B.revenue as revenue_percentage FROM ( SELECT sex, sum(fare) as revenue_sex FROM df GROUP BY 1 ) as A CROSS JOIN ( SELECT sum(fare) as revenue FROM df ) as B ') # combined from results of number 1 -> female paid much more per ticket # however bigger number of men were on the deck # 9. # what is the percentage of revenue made by 1st class tickets? sqldf(' SELECT A.revenue_1 ,B.revenue ,A.revenue_1 / B.revenue as percentage FROM ( SELECT SUM(fare) as revenue_1 FROM df WHERE pclass = 1 ) A CROSS JOIN ( SELECT sum(fare) as revenue FROM df ) B ') sqldf(' SELECT Pclass, count(*) as no_tickets FROM df GROUP BY 1 ') # 10. # How many people had at least 1 sibling on the deck, and what was the percentage # of those people who survived? # watch out for type of the variables sqldf(' SELECT count(*) as no_people FROM df WHERE sibsp >= 1 ') sqldf(' SELECT A.survived ,A.no_people ,B.no_people as people_with_siblings ,CAST(A.no_people as float) / B.no_people as percentage FROM ( SELECT survived, count(*) as no_people FROM df WHERE sibsp >= 1 GROUP BY 1 ) A CROSS JOIN ( SELECT count(*) as no_people FROM df WHERE sibsp >= 1 ) B ')