R语言实战:高级数据管理
目录
本文内容来自《R 语言实战》(R in Action, 2nd),有部分修改
数值和字符处理函数
统计函数
x <- 1:8
x
[1] 1 2 3 4 5 6 7 8
简化版本
mean(x)
[1] 4.5
sd(x)
[1] 2.44949
冗长版本
n <- length(x)
meanx <- sum(x) / n
meanx
[1] 4.5
css <- sum((x - meanx)^2)
sdx <- sqrt(css / (n - 1))
sdx
[1] 2.44949
scale()
函数
scale(x)
[,1]
[1,] -1.4288690
[2,] -1.0206207
[3,] -0.6123724
[4,] -0.2041241
[5,] 0.2041241
[6,] 0.6123724
[7,] 1.0206207
[8,] 1.4288690
attr(,"scaled:center")
[1] 4.5
attr(,"scaled:scale")
[1] 2.44949
概率函数
概率函数名的前缀表示功能:
d:密度函数
p:分布函数
q:分位数函数
r:随机数生成函数
runif(5)
[1] 0.3368141 0.3301134 0.9172243 0.7132270 0.5433076
runif(5)
[1] 0.5018690 0.5012920 0.2691537 0.1911182 0.1047126
set.seed()
设置随机数种子
set.seed(1234)
runif(5)
[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
set.seed(1234)
runif(5)
[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
多元正态数据
MASS 包 mvrnorm()
函数
library(MASS)
options(digits=3)
mean <- c(230.7, 146.7, 3.6)
sigma <- matrix(
c(
15360.8, 6721.2, -47.1,
6721.2, 4000.9, -16.5,
-47.1, -16.5, 0.3
),
nrow=3,
ncol=3
)
生成数据
set.seed(1234)
data <- mvrnorm(500, mean, sigma)
data <- as.data.frame(data)
names(data) <- c("y", "x1", "x2")
dim(data)
[1] 500 3
head(data, n=10)
y x1 x2
1 94.2 51.1 3.43
2 249.6 195.6 3.81
3 373.3 189.7 2.51
4 -59.9 15.4 4.71
5 305.9 123.3 3.41
6 290.1 182.2 2.74
7 140.7 154.9 4.41
8 169.4 102.2 3.64
9 165.5 104.8 3.50
10 120.7 96.2 4.09
将函数应用于矩阵和数据框
# Apply a function to a scalar: pass the variable itself to sqrt()
# rather than repeating the literal, so `a` is actually used.
a <- 5
sqrt(a)
[1] 2.24
b <- c(1.243, 5.654, 2.99)
round(b)
[1] 1 6 3
c <- matrix(runif(12), nrow=3)
c
[,1] [,2] [,3] [,4]
[1,] 0.9636 0.216 0.289 0.913
[2,] 0.2068 0.240 0.804 0.353
[3,] 0.0862 0.197 0.378 0.931
log(c)
[,1] [,2] [,3] [,4]
[1,] -0.0371 -1.53 -1.241 -0.0912
[2,] -1.5762 -1.43 -0.218 -1.0402
[3,] -2.4511 -1.62 -0.972 -0.0710
mean(c)
[1] 0.465
apply()
函数
data <- matrix(rnorm(30), nrow=6)
data
[,1] [,2] [,3] [,4] [,5]
[1,] 0.459 1.203 1.234 0.591 -0.281
[2,] -1.261 0.769 -1.891 -0.435 0.812
[3,] -0.527 0.238 -0.223 -0.251 -0.208
[4,] -0.557 -1.415 0.768 -0.926 1.451
[5,] -0.374 2.934 0.388 1.087 0.841
[6,] -0.604 0.935 0.609 -1.944 -0.866
dim(data)
[1] 6 5
MARGIN = 1 表示对每一行进行计算
apply(data, 1, mean)
[1] 0.641 -0.401 -0.194 -0.136 0.975 -0.374
MARGIN = 2 表示对每一列进行计算
apply(data, 2, mean)
[1] -0.478 0.777 0.148 -0.313 0.292
apply(data, 2, mean, trim=0.2)
[1] -0.516 0.786 0.386 -0.255 0.291
一个数据处理难题
options(digits=2)
Student <- c(
"John Davis",
"Angela Williams",
"Bullwinkle Moose",
"David Jones",
"Janice Markhammer",
"Cheryl Cushing",
"Reuven Ytzrhak",
"Greg Knox",
"Joel England",
"Mary Rayburn"
)
math <- c(
502, 600,
412, 358,
495, 512,
410, 625,
573, 522
)
science <- c(
95, 99,
80, 82,
75, 85,
80, 95,
89, 86
)
english <- c(
25, 22,
18, 15,
20, 28,
15, 30,
27, 18
)
roster <- data.frame(
Student,
math,
science,
english,
stringsAsFactors=FALSE
)
roster
Student math science english
1 John Davis 502 95 25
2 Angela Williams 600 99 22
3 Bullwinkle Moose 412 80 18
4 David Jones 358 82 15
5 Janice Markhammer 495 75 20
6 Cheryl Cushing 512 85 28
7 Reuven Ytzrhak 410 80 15
8 Greg Knox 625 95 30
9 Joel England 573 89 27
10 Mary Rayburn 522 86 18
计算综合得分
z <- scale(roster[, 2:4])
z
math science english
[1,] 0.013 1.078 0.587
[2,] 1.143 1.591 0.037
[3,] -1.026 -0.847 -0.697
[4,] -1.649 -0.590 -1.247
[5,] -0.068 -1.489 -0.330
[6,] 0.128 -0.205 1.137
[7,] -1.049 -0.847 -1.247
[8,] 1.432 1.078 1.504
[9,] 0.832 0.308 0.954
[10,] 0.243 -0.077 -0.697
attr(,"scaled:center")
math science english
501 87 22
attr(,"scaled:scale")
math science english
86.7 7.8 5.5
score <- apply(z, 1, mean)
roster <- cbind(roster, score)
roster
Student math science english score
1 John Davis 502 95 25 0.56
2 Angela Williams 600 99 22 0.92
3 Bullwinkle Moose 412 80 18 -0.86
4 David Jones 358 82 15 -1.16
5 Janice Markhammer 495 75 20 -0.63
6 Cheryl Cushing 512 85 28 0.35
7 Reuven Ytzrhak 410 80 15 -1.05
8 Greg Knox 625 95 30 1.34
9 Joel England 573 89 27 0.70
10 Mary Rayburn 522 86 18 -0.18
评分
y <- quantile(
score,
c(.8, .6, .4, .2)
)
y
80% 60% 40% 20%
0.74 0.44 -0.36 -0.89
roster$grade[score >= y[1]] <- "A"
roster$grade[score < y[1] & score >= y[2]] <- "B"
roster$grade[score < y[2] & score >= y[3]] <- "C"
roster$grade[score < y[3] & score >= y[4]] <- "D"
roster$grade[score < y[4]] <- "F"
roster
Student math science english score grade
1 John Davis 502 95 25 0.56 B
2 Angela Williams 600 99 22 0.92 A
3 Bullwinkle Moose 412 80 18 -0.86 D
4 David Jones 358 82 15 -1.16 F
5 Janice Markhammer 495 75 20 -0.63 D
6 Cheryl Cushing 512 85 28 0.35 C
7 Reuven Ytzrhak 410 80 15 -1.05 F
8 Greg Knox 625 95 30 1.34 A
9 Joel England 573 89 27 0.70 B
10 Mary Rayburn 522 86 18 -0.18 C
按姓氏和名字排序
# Split each "First Last" string on the space, then order the roster by
# last name, using first name as the tie-breaker.
name <- strsplit(roster$Student, " ")
# vapply() always yields a character vector with one entry per student,
# whereas sapply() can silently return a list for empty or ragged input.
last_name <- vapply(name, "[", character(1), 2)
first_name <- vapply(name, "[", character(1), 1)
roster <- roster[order(last_name, first_name), ]
roster
Student math science english score grade
6 Cheryl Cushing 512 85 28 0.35 C
1 John Davis 502 95 25 0.56 B
9 Joel England 573 89 27 0.70 B
4 David Jones 358 82 15 -1.16 F
8 Greg Knox 625 95 30 1.34 A
5 Janice Markhammer 495 75 20 -0.63 D
3 Bullwinkle Moose 412 80 18 -0.86 D
10 Mary Rayburn 522 86 18 -0.18 C
2 Angela Williams 600 99 22 0.92 A
7 Reuven Ytzrhak 410 80 15 -1.05 F
控制流
feelings <- c("sad", "afraid")
for (i in feelings) {
print(
switch(
i,
happy = "I am glad you are happy",
afraid = "There is nothing to fear",
sad = "Cheer up",
angry = "Calm down now"
)
)
}
[1] "Cheer up"
[1] "There is nothing to fear"
用户自编函数
# Summarise the center and spread of a vector.
#
# Args:
#   x:          vector handed to mean()/sd() or median()/mad().
#   parametric: if TRUE (default) use the mean and standard deviation;
#               otherwise use the median and median absolute deviation.
#   print:      if TRUE, also write the two statistics to the console
#               with cat().
#
# Returns:
#   A list with elements `center` and `spread`.
my_stats <- function(x, parametric=TRUE, print=FALSE) {
  if (parametric) {
    center <- mean(x)
    spread <- sd(x)
  } else {
    center <- median(x)
    spread <- mad(x)
  }
  # `&&` is the scalar, short-circuiting operator meant for if() conditions;
  # `&` is the elementwise operator for building vector masks.
  if (print && parametric) {
    # Label carries "=" like the SD/Median/MAD labels.
    cat("Mean=", center, "\n", "SD=", spread, "\n")
  } else if (print && !parametric) {
    cat("Median=", center, "\n", "MAD=", spread, "\n")
  }
  list(center=center, spread=spread)
}
set.seed(1234)
x <- rnorm(500)
y <- my_stats(x)
y
$center
[1] 0.0018
$spread
[1] 1
y <- my_stats(x, parametric=FALSE, print=TRUE)
y
Median= -0.021
MAD= 1
$center
[1] -0.021
$spread
[1] 1
# Format the current date and time in one of two layouts.
#
# Args:
#   type: "long" (default) formats as "%A %B %d %Y";
#         "short" formats as "%m-%d-%y".
#
# Returns:
#   The formatted string for "long" or "short". For any other string a
#   message is written with cat() and the result of cat() is returned.
my_date <- function(type="long") {
  now <- Sys.time()
  switch(
    type,
    long = format(now, "%A %B %d %Y"),
    short = format(now, "%m-%d-%y"),
    # Unnamed last alternative: reached when `type` matches no name above.
    cat(type, "is not a recognized type\n")
  )
}
my_date("long")
[1] "星期三 十二月 30 2020"
my_date("short")
[1] "12-30-20"
my_date()
[1] "星期三 十二月 30 2020"
my_date("medium")
medium is not a recognized type
整合与重构
aggregate and reshape
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21 6 160 110 3.9 2.6 16 0 1 4 4
Mazda RX4 Wag 21 6 160 110 3.9 2.9 17 0 1 4 4
Datsun 710 23 4 108 93 3.8 2.3 19 1 1 4 1
Hornet 4 Drive 21 6 258 110 3.1 3.2 19 1 0 3 1
Hornet Sportabout 19 8 360 175 3.1 3.4 17 0 0 3 2
Valiant 18 6 225 105 2.8 3.5 20 1 0 3 1
转置
cars <- mtcars[1:5, 1:4]
cars
mpg cyl disp hp
Mazda RX4 21 6 160 110
Mazda RX4 Wag 21 6 160 110
Datsun 710 23 4 108 93
Hornet 4 Drive 21 6 258 110
Hornet Sportabout 19 8 360 175
t(cars)
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
mpg 21 21 23 21 19
cyl 6 6 4 6 8
disp 160 160 108 258 360
hp 110 110 93 110 175
整合数据
options(digits=3)
# Mean of every mtcars column within each (cyl, gear) combination.
# The grouping vectors are referenced explicitly instead of via
# attach()/detach(), which changes the search path and can mask
# variables of the same name. Because the list is unnamed, the grouping
# columns in the result are still called Group.1 and Group.2.
agg_data <- aggregate(
  mtcars,
  by = list(mtcars$cyl, mtcars$gear),
  FUN = mean,
  na.rm = TRUE
)
agg_data
Group.1 Group.2 mpg cyl disp hp drat wt qsec vs am gear carb
1 4 3 21.5 4 120 97 3.70 2.46 20.0 1.0 0.00 3 1.00
2 6 3 19.8 6 242 108 2.92 3.34 19.8 1.0 0.00 3 1.00
3 8 3 15.1 8 358 194 3.12 4.10 17.1 0.0 0.00 3 3.08
4 4 4 26.9 4 103 76 4.11 2.38 19.6 1.0 0.75 4 1.50
5 6 4 19.8 6 164 116 3.91 3.09 17.7 0.5 0.50 4 4.00
6 4 5 28.2 4 108 102 4.10 1.83 16.8 0.5 1.00 5 2.00
7 6 5 19.7 6 145 175 3.62 2.77 15.5 0.0 1.00 5 6.00
8 8 5 15.4 8 326 300 3.88 3.37 14.6 0.0 1.00 5 6.00
reshape2
示例数据,包括两个标识符变量 ID 和 Time,两个测量变量 X1 和 X2
my_data <- data.frame(
ID=c(1, 1, 2, 2),
Time=c(1, 2, 1, 2),
X1=c(5, 3, 6, 2),
X2=c(6, 5, 1, 4)
)
my_data
ID Time X1 X2
1 1 1 5 6
2 1 2 3 5
3 2 1 6 1
4 2 2 2 4
library(reshape2)
融合 melt()
每一行都是一个单独的测量,表示为:
唯一标识符 + 测量变量
# Melt to long format: ID and Time identify each row, while the remaining
# columns become (variable, value) pairs. `id.vars` is spelled out in full
# rather than relying on partial matching of `id`.
md <- melt(my_data, id.vars = c("ID", "Time"))
md
ID Time variable value
1 1 1 X1 5
2 1 2 X1 3
3 2 1 X1 6
4 2 2 X1 2
5 1 1 X2 6
6 1 2 X2 5
7 2 1 X2 1
8 2 2 X2 4
重铸 dcast()
不执行整合
相当于被重塑 (reshape)
dcast(md, ID + Time ~ variable)
ID Time X1 X2
1 1 1 5 6
2 1 2 3 5
3 2 1 6 1
4 2 2 2 4
dcast(md, ID + variable ~ Time)
ID variable 1 2
1 1 X1 5 3
2 1 X2 6 5
3 2 X1 6 2
4 2 X2 1 4
dcast(md, ID ~ variable + Time)
ID X1_1 X1_2 X2_1 X2_2
1 1 5 3 6 5
2 2 6 2 1 4
执行整合
附加整合函数,例如 mean
dcast(md, ID ~ variable, mean.default)
ID X1 X2
1 1 4 5.5
2 2 4 2.5
dcast(md, Time~variable, mean.default)
Time X1 X2
1 1 5.5 3.5
2 2 2.5 4.5
dcast(md, ID~Time, mean.default)
ID 1 2
1 1 5.5 4
2 2 3.5 3
参考
R 语言实战
《图形初阶》
《基本数据管理》