dplyr을 SQL로
install.packages(dplyr)
install.packages(nycflights13)
install.packages(sqldf)
#install.packages('RSQLite')
dplyr을 SQL로 번역하기
데이터 분석가에게 필요한 역량 중의 하나가 SQL(Structured Query Language)이다. 하지만 시간을 내어 배우기 힘들고, 간단하게 사용해야 할 필요가 있을 때에는 간단하게 dplyr
함수를 번역해서 쓸 수 있다.
아래에는 수기로 하는 번역과 dbplyr
의 함수를 활용하여 자동으로 번역하는 방법을 소개한다.
수기 번역
아래의 표는 dplyr
의 대표적인 함수를 SQL
로 번역한 결과를 보여준다.
package:dplyr |
SQL |
---|---|
dat %>% select(cola,colb) |
SELECT cola, colb FROM dat |
head(5) |
LIMIT 5 |
filter(cola > 3) |
WHERE cola > 3 |
filter(cola > 3 & colb < 5) |
WHERE cola > 3 AND colb < 5 |
filter(is.na(cola)) |
WHERE cola IS NULL |
select(cola) %>% filter(!duplicated(cola)) |
SELECT DISTINCT cola |
filter(year(coldate) >= 2002 & year(coldate) <= 2003) |
WHERE coldate BETWEEN 2002 AND 2003 |
filter(country == "Korea") |
WHERE country = 'Korea' |
filter(str_detect(cola, '^.a')) |
WHERE cola LIKE '_a%' |
filter(str_detect(name, "^B")) |
WHERE name LIKE 'B%' |
transmute(colc = cola - colb) |
SELECT cola - colb AS colc |
mutate(result=8 %/% 3) |
SELECT (8/3) AS result |
group_by(colb) |
GROUP BY colb |
arrange(cola) |
ORDER BY cola |
filter(!duplicated(cola)) %>% nrow |
SELECT COUNT(DISTINCT cola) |
nrow(dfA) |
SELECT COUNT(*) FROM dfA |
SQL
에서 몇 가지 주의할 점은 다음같 같다.
SQL
에서 문자열은'
로 감싼다."
를 사용할 수 없다.SQL
에서NULL
은 결측치를 나타낸다.- 문자열 패턴을 나타내는
'B%'
,'_a%'
을 유의해서 보자. - 위
dplyr
함수는dat %>%
가 생략되었다.SQL
에서도FROM dat
이 생략되었다.
실례
- 여기서는 R의 데이터 프레임에 SQL 명령을 사용할 수 있는
sqldf
함수로dplyr
의 함수와 SQL의 방법을 비교해 보았다.
library(dplyr)
library(sqldf)
#library(RSQLite)
library(nycflights13)
head(flights)
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## 4 2013 1 1 544 545 -1 1004 ## 5 2013 1 1 554 600 -6 812 ## 6 2013 1 1 554 558 -4 740 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm>
dat <- flights
a1 <- dat %>% select(carrier, origin, dest)
a2 <- sqldf('SELECT carrier, origin, dest FROM dat')
all.equal(a1, as_tibble(a2))
## [1] TRUE
(r1 <- dat %>% head(5))
r2 <- as_tibble(sqldf('SELECT * FROM dat LIMIT 5'))
all.equal(r1,r2)
## # A tibble: 5 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## 4 2013 1 1 544 545 -1 1004 ## 5 2013 1 1 554 600 -6 812 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm> ## [1] TRUE
(r1 <- dat %>% filter(dep_delay > 1000)) %>% head
r2 <- as_tibble(sqldf('SELECT * FROM dat WHERE dep_delay > 1000'))
all.equal(r1, r2)
## # A tibble: 5 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 9 641 900 1301 1242 ## 2 2013 1 10 1121 1635 1126 1239 ## 3 2013 6 15 1432 1935 1137 1607 ## 4 2013 7 22 845 1600 1005 1044 ## 5 2013 9 20 1139 1845 1014 1457 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm> ## [1] TRUE
(r1 <- dat %>% filter(dep_delay > 1000 & arr_time < 1000)) %>% head
r2 <- as_tibble(sqldf('SELECT * FROM dat WHERE dep_delay > 1000 AND arr_time < 1000')) %>% head
all.equal(r1,r2)
## # A tibble: 0 x 19 ## # ... with 19 variables: year <int>, month <int>, day <int>, ## # dep_time <int>, sched_dep_time <int>, dep_delay <dbl>, arr_time <int>, ## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, ## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> ## [1] TRUE
(r1 <- dat %>% filter(is.na(tailnum))) %>% head
r2 <- as_tibble(sqldf('SELECT * FROM dat WHERE tailnum IS NULL'))
all.equal(r1, r2)
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 2 NA 1545 NA NA ## 2 2013 1 2 NA 1601 NA NA ## 3 2013 1 3 NA 857 NA NA ## 4 2013 1 3 NA 645 NA NA ## 5 2013 1 4 NA 845 NA NA ## 6 2013 1 4 NA 1830 NA NA ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm> ## [1] TRUE
(r1 <- dat %>% select(carrier) %>% filter(!duplicated(carrier)))
r2 <- as_tibble(sqldf('SELECT DISTINCT carrier FROM dat'))
all.equal(r1, r2)
## # A tibble: 16 x 1 ## carrier ## <chr> ## 1 UA ## 2 AA ## 3 B6 ## 4 DL ## 5 EV ## 6 MQ ## 7 US ## 8 WN ## 9 VX ## 10 FL ## 11 AS ## 12 9E ## 13 F9 ## 14 HA ## 15 YV ## 16 OO ## [1] TRUE
다음의 경우 결과의 형태가 다르다는 점을 유의하자(첫 번째 결과는 벡터이고, 두 번째 결과는 티블이다).
(r1 <- dat %>% select(carrier) %>% filter(!duplicated(carrier)) %>% nrow)
r2 <- as_tibble(sqldf('SELECT COUNT(DISTINCT carrier) FROM dat'))
all.equal(r1, r2)
## [1] 16 ## [1] "Modes: numeric, list" ## [2] "names for current but not for target" ## [3] "Attributes: < target is NULL, current is list >" ## [4] "target is numeric, current is tbl_df"
(r1 <- dat %>% transmute(speed = distance/air_time)) %>% head
r2 <- as_tibble(sqldf('SELECT distance/air_time AS speed FROM dat'))
all.equal(r1, r2)
## # A tibble: 6 x 1 ## speed ## <dbl> ## 1 6.17 ## 2 6.24 ## 3 6.81 ## 4 8.61 ## 5 6.57 ## 6 4.79 ## [1] TRUE
library(stringr)
(r1 <- dat %>% filter(str_detect(carrier, "^B"))) %>% head
r2 <- as_tibble(sqldf("SELECT * FROM dat WHERE carrier LIKE 'B%'"))
all.equal(r1, r2)
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 544 545 -1 1004 ## 2 2013 1 1 555 600 -5 913 ## 3 2013 1 1 557 600 -3 838 ## 4 2013 1 1 558 600 -2 849 ## 5 2013 1 1 558 600 -2 853 ## 6 2013 1 1 559 559 0 702 ## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm> ## [1] TRUE
(r1 <- dat %>% transmute(a = 8 %/% 3)) %>% head
r2 <- as_tibble(sqldf('SELECT (8/3) AS a FROM dat'))
all.equal(r1, r2)
## # A tibble: 6 x 1 ## a ## <dbl> ## 1 2 ## 2 2 ## 3 2 ## 4 2 ## 5 2 ## 6 2 ## [1] "Incompatible type for column `a`: x numeric, y integer"
dbplyr
패키지를 사용한 자동 번역
dbplyr
은 dplyr -> SQL 번역을 자동으로 해 준다. 관련링크: sql-translation
단순한 예를 들어 보자. 위의 링크에서 가져왔다.
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
flights <- copy_to(con, nycflights13::flights)
airports <- copy_to(con, nycflights13::airports)
flights %>%
select(contains("delay")) %>%
show_query()
## <SQL> ## SELECT `dep_delay`, `arr_delay` ## FROM `nycflights13::flights`
결과를 실제 활용해 보면,
DBI::dbGetQuery(con, 'SELECT `dep_delay`, `arr_delay` FROM `nycflights13::flights`') %>% head
## dep_delay arr_delay ## 1 2 11 ## 2 4 20 ## 3 2 33 ## 4 -1 -18 ## 5 -6 -25 ## 6 -4 12
실례
flights.db
생성
dbFilename <- "flights.db"
if (file.exists(dbFilename)) file.remove(dbFilename)
con2 <- DBI::dbConnect(drv=SQLite(), dbname=dbFilename)
dbListTables(con2)
data(flights)
dbWriteTable(con2, "flights", flights)
dbListTables(con2)
dbDisconnect(con2)
- 위에서 번역된 SQL문 사용
con2 <- DBI::dbConnect(drv=SQLite(), dbname=dbFilename)
DBI::dbGetQuery(con2, 'SELECT `dep_delay`, `arr_delay` FROM flights') %>% head
dbDisconnect(con2)
Leave a comment