Summarising data frame
데이터 프레임 살펴보기
최근 특강을 준비하면서, 데이터 프레임 전체에 대해 요약/정리하는 패키지/함수가 여럿 개발되었음을 확인하였습니다. 여기서는 `mtcars`를 활용하여 이들을 사용하는 예를 보이겠습니다.
mtcars
# Preview the first rows of the built-in mtcars data frame.
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb ## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 ## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 ## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 ## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 ## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 ## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Report the class of every column of mtcars.
sapply(mtcars, class)
## mpg cyl disp hp drat wt qsec vs am gear carb ## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
우선 `mtcars`를 다음과 같이 수정합니다.
library(dplyr)
# Recode the two binary indicator columns of mtcars as labelled factors.
# Levels are listed explicitly: factor() otherwise sorts the labels using the
# current locale's collation, so "V-shaped" and "straight" could swap order
# between locales (e.g. C vs. en_US). The order below matches the summary()
# output shown in this file ("straight" first, then "V-shaped").
mtcars2 <- mtcars %>%
  mutate(
    am = factor(ifelse(am == 0, "auto", "manual"),
                levels = c("auto", "manual")),
    vs = factor(ifelse(vs == 0, "V-shaped", "straight"),
                levels = c("straight", "V-shaped"))
  )

# Convert the count-like columns to ordered factors.
# Note: this starts from the original mtcars, not mtcars2, so am and vs
# remain numeric in mtcars3.
mtcars3 <- mtcars %>%
  mutate(
    cyl = ordered(cyl, levels = c(4, 6, 8)),
    gear = ordered(gear, levels = c(3, 4, 5)),
    carb = ordered(carb, levels = c(1, 2, 3, 4, 6, 8))
  )

# Short aliases used by the calls that follow in this file.
dat <- mtcars2
dat2 <- mtcars3
기본 텍스트형
summary()
prettyR::freq()
psych::describe()
Hmisc::describe()
pastecs::stat.desc()
# Base R per-column summary of dat: quartiles and mean for numeric columns,
# level counts for the factor columns (vs, am).
summary(dat)
## mpg cyl disp hp drat wt qsec ## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0 Min. :2.760 Min. :1.513 Min. :14.50 ## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 ## Median :19.20 Median :6.000 Median :196.3 Median :123.0 Median :3.695 Median :3.325 Median :17.71 ## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7 Mean :3.597 Mean :3.217 Mean :17.85 ## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 ## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0 Max. :4.930 Max. :5.424 Max. :22.90 ## vs am gear carb ## straight:14 auto :19 Min. :3.000 Min. :1.000 ## V-shaped:18 manual:13 1st Qu.:3.000 1st Qu.:2.000 ## Median :4.000 Median :2.000 ## Mean :3.688 Mean :2.812 ## 3rd Qu.:4.000 3rd Qu.:4.000 ## Max. :5.000 Max. :8.000
# Frequency table for every column of dat: counts, percentages, and
# percentages excluding NA.
prettyR::freq(dat)
## ## Frequencies for mpg ## 10.4 15.2 19.2 21 21.4 22.8 30.4 13.3 14.3 14.7 15 15.5 15.8 16.4 17.3 17.8 18.1 18.7 19.7 21.5 24.4 26 27.3 32.4 33.9 NA ## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 ## % 6.2 6.2 6.2 6.2 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 0 ## %!NA 6.2 6.2 6.2 6.2 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ## ## ## Frequencies for cyl ## 8 4 6 NA ## 14 11 7 0 ## % 43.8 34.4 21.9 0 ## %!NA 43.8 34.4 21.9 ## ## ## Frequencies for disp ## 275.8 160 167.6 360 71.1 75.7 78.7 79 95.1 108 120.1 120.3 121 140.8 145 146.7 225 258 301 304 318 350 351 400 440 460 472 NA ## 3 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 ## % 9.4 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 0 ## %!NA 9.4 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ## ## ## Frequencies for hp ## 110 175 180 66 123 150 245 52 62 65 91 93 95 97 105 109 113 205 215 230 264 335 NA ## 3 3 3 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 ## % 9.4 9.4 9.4 6.2 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 0 ## %!NA 9.4 9.4 9.4 6.2 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ## ## ## Frequencies for drat ## 3.07 3.92 2.76 3.08 3.15 3.9 4.08 4.22 2.93 3 3.21 3.23 3.54 3.62 3.69 3.7 3.73 3.77 3.85 4.11 4.43 4.93 NA ## 3 3 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 ## % 9.4 9.4 6.2 6.2 6.2 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 0 ## %!NA 9.4 9.4 6.2 6.2 6.2 6.2 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ## ## ## Frequencies for wt ## 3.44 3.57 1.513 1.615 1.835 1.935 2.14 2.2 2.32 2.465 2.62 2.77 2.78 2.875 3.15 3.17 3.19 3.215 3.435 3.46 3.52 3.73 3.78 3.84 3.845 4.07 5.25 5.345 5.424 NA ## 3 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 ## % 9.4 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 
3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 0 ## %!NA 9.4 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ## ## ## Frequencies for qsec ## 17.02 18.9 14.5 14.6 15.41 15.5 15.84 16.46 16.7 16.87 16.9 17.05 17.3 17.4 17.42 17.6 17.82 17.98 18 18.3 18.52 18.6 18.61 19.44 19.47 19.9 20 20.01 20.22 22.9 NA ## 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 ## % 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 0 ## %!NA 6.2 6.2 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ## ## ## Frequencies for vs ## V-shaped straight NA ## 18 14 0 ## % 56.2 43.8 0 ## %!NA 56.2 43.8 ## ## ## Frequencies for am ## auto manual NA ## 19 13 0 ## % 59.4 40.6 0 ## %!NA 59.4 40.6 ## ## ## Frequencies for gear ## 3 4 5 NA ## 15 12 5 0 ## % 46.9 37.5 15.6 0 ## %!NA 46.9 37.5 15.6 ## ## ## Frequencies for carb ## 2 4 1 3 6 8 NA ## 10 10 7 3 1 1 0 ## % 31.2 31.2 21.9 9.4 3.1 3.1 0 ## %!NA 31.2 31.2 21.9 9.4 3.1 3.1
# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) per column.
# Factor columns are flagged with a trailing "*" in the output; their
# statistics appear to be computed on the integer level codes.
psych::describe(dat)
## vars n mean sd median trimmed mad min max range skew kurtosis se ## mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37 1.07 ## cyl 2 32 6.19 1.79 6.00 6.23 2.97 4.00 8.00 4.00 -0.17 -1.76 0.32 ## disp 3 32 230.72 123.94 196.30 222.52 140.48 71.10 472.00 400.90 0.38 -1.21 21.91 ## hp 4 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14 12.12 ## drat 5 32 3.60 0.53 3.70 3.58 0.70 2.76 4.93 2.17 0.27 -0.71 0.09 ## wt 6 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02 0.17 ## qsec 7 32 17.85 1.79 17.71 17.83 1.42 14.50 22.90 8.40 0.37 0.34 0.32 ## vs* 8 32 1.56 0.50 2.00 1.58 0.00 1.00 2.00 1.00 -0.24 -2.00 0.09 ## am* 9 32 1.41 0.50 1.00 1.38 0.00 1.00 2.00 1.00 0.36 -1.92 0.09 ## gear 10 32 3.69 0.74 4.00 3.62 1.48 3.00 5.00 2.00 0.53 -1.07 0.13 ## carb 11 32 2.81 1.62 2.00 2.65 1.48 1.00 8.00 7.00 1.05 1.26 0.29
# Two further text summaries, left disabled here (no output is shown for
# them in this file); uncomment to run.
#Hmisc::describe(dat)
#pastecs::stat.desc(dat)
그리고 또
skimr::skim()
DataExplorer::create_report()
inspectdf::inspect_types()
inspect_mem()
inspect_cat()
inspect_cor()
# One-time setup: uncomment to install skimr.
#install.packages('skimr')
library(skimr)
# Summarise dat grouped by column type (factor vs. numeric), including
# missing-value counts and inline histograms for numeric columns.
skim(dat)
Name | dat |
Number of rows | 32 |
Number of columns | 11 |
_______________________ | |
Column type frequency: | |
factor | 2 |
numeric | 9 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
vs | 0 | 1 | FALSE | 2 | V-s: 18, str: 14 |
am | 0 | 1 | FALSE | 2 | aut: 19, man: 13 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
mpg | 0 | 1 | 20.09 | 6.03 | 10.40 | 15.43 | 19.20 | 22.80 | 33.90 | ▃▇▅▁▂ |
cyl | 0 | 1 | 6.19 | 1.79 | 4.00 | 4.00 | 6.00 | 8.00 | 8.00 | ▆▁▃▁▇ |
disp | 0 | 1 | 230.72 | 123.94 | 71.10 | 120.83 | 196.30 | 326.00 | 472.00 | ▇▃▃▃▂ |
hp | 0 | 1 | 146.69 | 68.56 | 52.00 | 96.50 | 123.00 | 180.00 | 335.00 | ▇▇▆▃▁ |
drat | 0 | 1 | 3.60 | 0.53 | 2.76 | 3.08 | 3.70 | 3.92 | 4.93 | ▇▃▇▅▁ |
wt | 0 | 1 | 3.22 | 0.98 | 1.51 | 2.58 | 3.33 | 3.61 | 5.42 | ▃▃▇▁▂ |
qsec | 0 | 1 | 17.85 | 1.79 | 14.50 | 16.89 | 17.71 | 18.90 | 22.90 | ▃▇▇▂▁ |
gear | 0 | 1 | 3.69 | 0.74 | 3.00 | 3.00 | 4.00 | 4.00 | 5.00 | ▇▁▆▁▂ |
carb | 0 | 1 | 2.81 | 1.62 | 1.00 | 2.00 | 2.00 | 4.00 | 8.00 | ▇▂▅▁▁ |
# One-time setup: uncomment to install inspectdf.
#install.packages('inspectdf')
library(inspectdf)
# Each inspect_*() call below is left commented out; uncomment a line to
# compute that summary of dat2 and draw it with show_plot().
# NOTE(review): the two-argument form inspect_types(dat2, dat) presumably
# compares the two data frames; confirm against the inspectdf docs.
#inspect_types(dat2) %>% show_plot()
#inspect_types(dat2, dat) %>% show_plot()
#inspect_mem(dat2) %>% show_plot()
#inspect_na(dat2) %>% show_plot()
#inspect_num(dat2) %>% show_plot()
#inspect_imb(dat2) %>% show_plot()
#inspect_cat(dat2) %>% show_plot()
#inspect_cor(dat2) %>% show_plot()
GGally::ggpairs()
R의 기본 그래픽 함수 `pairs()`나 `GGally::ggpairs()` 역시 요긴합니다!
# Pairwise plots of all columns of dat2: base graphics first, then the
# GGally version.
pairs(dat2)
GGally::ggpairs(dat2)
P.S
마지막으로 `create_report(dat)`의 결과는 …
# One-time setup: uncomment to install DataExplorer.
#install.packages('DataExplorer')
library(DataExplorer)
# Build the DataExplorer report for dat.
# NOTE(review): presumably writes a report file to the working directory;
# confirm the output location in the DataExplorer docs.
create_report(dat)
Leave a comment