library(DBI)
Loading required package: methods
con = dbConnect(RPostgreSQL::PostgreSQL(),
dbname = "nycflights13",
user = "csor",
password = "csor")
mtcars
into the database from RdbWriteTable(con, "mtcars", mtcars)
[1] TRUE
Average mileage by number of cylinders:
SELECT cyl, am, avg(mpg) FROM mtcars
GROUP BY cyl, am;
Total weight of the listed cars per manufacturer:
SELECT left("row.names", strpos("row.names", ' ') - 1) AS manufacturer,
sum(wt)
FROM mtcars
GROUP BY manufacturer;
Notice the choice of R string quotes and the escaped single quotes in q2
:
q1 <- 'SELECT cyl, am, avg(mpg) FROM mtcars GROUP BY cyl, am;'
dbGetQuery(con, q1)
q2 <- '
SELECT left("row.names", strpos("row.names", \' \') - 1) AS manufacturer,
sum(wt)
FROM mtcars
GROUP BY manufacturer;'
dbGetQuery(con, q2)
query_avg
query_avg <- function(con, name, n, m = NULL) {
fields <- dbListFields(con, name)
enquote <- function(x) paste0('"', x, '"')
paste('SELECT',
if (!is.null(m)) paste0(enquote(fields[m]), ','),
'avg(', enquote(fields[n]), ') FROM', enquote(name),
if (!is.null(m)) paste('GROUP BY', enquote(fields[m])), ';')
}
query_avg
Expectation:
q1
should result in an error, not because of invalid syntax, but because the column number 1 is the automatically created row.names
column - a string variable, for which an average cannot be calculated. You might expect mpg
to be the first column, as it is for the mtcars
data frame, but row names do not exist in SQL.q2
should return the average disp
grouped by vs
.q1 <- query_avg(con, "mtcars", 1)
dbGetQuery(con, q1)
Warning in postgresqlQuickSQL(conn, statement, ...): Could not create
execute: SELECT avg( "row.names" ) FROM "mtcars" ;
NULL
q2 <- query_avg(con, "mtcars", 4, 9)
dbGetQuery(con, q2)
mtcars
from the database againdbRemoveTable(con, "mtcars")
[1] TRUE
library(tidyverse)
flights <- tbl(con, "flights")
planes <- tbl(con, "planes")
airlines <- tbl(con, "airlines")
airports <- tbl(con, "airports")
For each of the exercises, both pure SQL and tidyverse solutions are shown.
SELECT carrier, name, avg(arr_delay)
FROM flights JOIN airlines USING (carrier)
GROUP BY carrier, name;
q <- flights %>%
inner_join(airlines, by = "carrier") %>%
group_by(carrier, name) %>%
summarise(mean(arr_delay, na.rm = TRUE))
q
show_query(q)
<SQL>
SELECT "carrier", "name", AVG("arr_delay") AS "mean(arr_delay, na.rm = TRUE)"
FROM (SELECT "TBL_LEFT"."year" AS "year", "TBL_LEFT"."month" AS "month", "TBL_LEFT"."day" AS "day", "TBL_LEFT"."dep_time" AS "dep_time", "TBL_LEFT"."sched_dep_time" AS "sched_dep_time", "TBL_LEFT"."dep_delay" AS "dep_delay", "TBL_LEFT"."arr_time" AS "arr_time", "TBL_LEFT"."sched_arr_time" AS "sched_arr_time", "TBL_LEFT"."arr_delay" AS "arr_delay", "TBL_LEFT"."carrier" AS "carrier", "TBL_LEFT"."flight" AS "flight", "TBL_LEFT"."tailnum" AS "tailnum", "TBL_LEFT"."origin" AS "origin", "TBL_LEFT"."dest" AS "dest", "TBL_LEFT"."air_time" AS "air_time", "TBL_LEFT"."distance" AS "distance", "TBL_LEFT"."hour" AS "hour", "TBL_LEFT"."minute" AS "minute", "TBL_LEFT"."time_hour" AS "time_hour", "TBL_RIGHT"."name" AS "name"
FROM "flights" AS "TBL_LEFT"
INNER JOIN "airlines" AS "TBL_RIGHT"
ON ("TBL_LEFT"."carrier" = "TBL_RIGHT"."carrier")
) "bfewiupicm"
GROUP BY "carrier", "name"
It is clear to see that while you can easily build valid SQL, it’s not necessarily pretty SQL.
planes
Using an anti join subquery, we filter out the records that do have matches and count the remaining.
SELECT carrier, name, count(*)
FROM flights JOIN airlines USING (carrier)
WHERE NOT EXISTS (
SELECT 1 FROM planes
WHERE tailnum = flights.tailnum
)
GROUP BY carrier, name;
q <- flights %>%
anti_join(planes, by = "tailnum") %>%
inner_join(airlines, by = "carrier") %>%
group_by(carrier, name) %>%
count()
q
show_query(q)
<SQL>
SELECT "carrier", "name", COUNT(*) AS "n"
FROM (SELECT "TBL_LEFT"."year" AS "year", "TBL_LEFT"."month" AS "month", "TBL_LEFT"."day" AS "day", "TBL_LEFT"."dep_time" AS "dep_time", "TBL_LEFT"."sched_dep_time" AS "sched_dep_time", "TBL_LEFT"."dep_delay" AS "dep_delay", "TBL_LEFT"."arr_time" AS "arr_time", "TBL_LEFT"."sched_arr_time" AS "sched_arr_time", "TBL_LEFT"."arr_delay" AS "arr_delay", "TBL_LEFT"."carrier" AS "carrier", "TBL_LEFT"."flight" AS "flight", "TBL_LEFT"."tailnum" AS "tailnum", "TBL_LEFT"."origin" AS "origin", "TBL_LEFT"."dest" AS "dest", "TBL_LEFT"."air_time" AS "air_time", "TBL_LEFT"."distance" AS "distance", "TBL_LEFT"."hour" AS "hour", "TBL_LEFT"."minute" AS "minute", "TBL_LEFT"."time_hour" AS "time_hour", "TBL_RIGHT"."name" AS "name"
FROM (SELECT * FROM "flights" AS "TBL_LEFT"
WHERE NOT EXISTS (
SELECT 1 FROM "planes" AS "TBL_RIGHT"
WHERE ("TBL_LEFT"."tailnum" = "TBL_RIGHT"."tailnum")
)) "TBL_LEFT"
INNER JOIN "airlines" AS "TBL_RIGHT"
ON ("TBL_LEFT"."carrier" = "TBL_RIGHT"."carrier")
) "cuieqoklhp"
GROUP BY "carrier", "name"
Still, aside from the added verbosity, the two queries are largely the same.
SELECT airlines.name AS airline, airports.name AS airport, count(*)
FROM flights
JOIN airlines USING (carrier)
JOIN airports ON (dest = faa)
GROUP BY airline, airport
ORDER BY airline, airport;
Sorted by airline and airport for clarity.
q <- flights %>%
inner_join(airlines, by = "carrier") %>%
inner_join(airports, by = c("dest" = "faa")) %>%
rename(airline = name.x,
airport = name.y) %>%
group_by(airline, airport) %>%
count() %>%
arrange(airline, airport)
q
show_query(q)
<SQL>
SELECT "airline", "airport", COUNT(*) AS "n"
FROM (SELECT "year", "month", "day", "dep_time", "sched_dep_time", "dep_delay", "arr_time", "sched_arr_time", "arr_delay", "carrier", "flight", "tailnum", "origin", "dest", "air_time", "distance", "hour", "minute", "time_hour", "name.x" AS "airline", "name.y" AS "airport", "lat", "lon", "alt", "tz", "dst", "tzone"
FROM (SELECT "TBL_LEFT"."year" AS "year", "TBL_LEFT"."month" AS "month", "TBL_LEFT"."day" AS "day", "TBL_LEFT"."dep_time" AS "dep_time", "TBL_LEFT"."sched_dep_time" AS "sched_dep_time", "TBL_LEFT"."dep_delay" AS "dep_delay", "TBL_LEFT"."arr_time" AS "arr_time", "TBL_LEFT"."sched_arr_time" AS "sched_arr_time", "TBL_LEFT"."arr_delay" AS "arr_delay", "TBL_LEFT"."carrier" AS "carrier", "TBL_LEFT"."flight" AS "flight", "TBL_LEFT"."tailnum" AS "tailnum", "TBL_LEFT"."origin" AS "origin", "TBL_LEFT"."dest" AS "dest", "TBL_LEFT"."air_time" AS "air_time", "TBL_LEFT"."distance" AS "distance", "TBL_LEFT"."hour" AS "hour", "TBL_LEFT"."minute" AS "minute", "TBL_LEFT"."time_hour" AS "time_hour", "TBL_LEFT"."name" AS "name.x", "TBL_RIGHT"."name" AS "name.y", "TBL_RIGHT"."lat" AS "lat", "TBL_RIGHT"."lon" AS "lon", "TBL_RIGHT"."alt" AS "alt", "TBL_RIGHT"."tz" AS "tz", "TBL_RIGHT"."dst" AS "dst", "TBL_RIGHT"."tzone" AS "tzone"
FROM (SELECT "TBL_LEFT"."year" AS "year", "TBL_LEFT"."month" AS "month", "TBL_LEFT"."day" AS "day", "TBL_LEFT"."dep_time" AS "dep_time", "TBL_LEFT"."sched_dep_time" AS "sched_dep_time", "TBL_LEFT"."dep_delay" AS "dep_delay", "TBL_LEFT"."arr_time" AS "arr_time", "TBL_LEFT"."sched_arr_time" AS "sched_arr_time", "TBL_LEFT"."arr_delay" AS "arr_delay", "TBL_LEFT"."carrier" AS "carrier", "TBL_LEFT"."flight" AS "flight", "TBL_LEFT"."tailnum" AS "tailnum", "TBL_LEFT"."origin" AS "origin", "TBL_LEFT"."dest" AS "dest", "TBL_LEFT"."air_time" AS "air_time", "TBL_LEFT"."distance" AS "distance", "TBL_LEFT"."hour" AS "hour", "TBL_LEFT"."minute" AS "minute", "TBL_LEFT"."time_hour" AS "time_hour", "TBL_RIGHT"."name" AS "name"
FROM "flights" AS "TBL_LEFT"
INNER JOIN "airlines" AS "TBL_RIGHT"
ON ("TBL_LEFT"."carrier" = "TBL_RIGHT"."carrier")
) "TBL_LEFT"
INNER JOIN "airports" AS "TBL_RIGHT"
ON ("TBL_LEFT"."dest" = "TBL_RIGHT"."faa")
) "tabfxaztkf") "gcbgkfrtcs"
GROUP BY "airline", "airport"
ORDER BY "airline", "airport"