R语言实战:基本数据管理
目录
本文内容来自《R 语言实战》(R in Action, 2nd),有部分修改
示例
构造 data.frame
对象
manager <- 1:5
date <- c(
"10/24/08",
"10/28/08",
"10/1/08",
"10/12/08",
"5/1/09"
)
country <- c(
"US",
"US",
"UK",
"UK",
"UK"
)
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(
manager,
date,
country,
gender,
age,
q1,
q2,
q3,
q4,
q5,
stringsAsFactors=FALSE
)
leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F 99 2 2 1 2 1
每一列表示一个变量,每一行表示一个观测
创建新变量
三种方式
赋值
mydata <- data.frame(
x1=c(2, 2, 6, 4),
x2=c(3, 4, 2, 8)
)
mydata$sumx <- mydata$x1 + mydata$x2
mydata$meanx <- (mydata$x1 + mydata$x2)/2
使用 attach()
省略对象名
attach(mydata)
mydata$sumx <- x1 + x2
mydata$meanx <- (x1 + x2)/2
detach(mydata)
transform()
函数
mydata <- transform(
mydata,
sumx=x1 + x2,
meanx=(x1 + x2)/2
)
变量的重编码
修改现有变量
leadership$age[leadership$age == 99] <- NA
leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F NA 2 2 1 2 1
增加新变量
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[
leadership$age >= 55 &
leadership$age <= 75
] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
leadership
manager date country gender age q1 q2 q3 q4 q5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
使用 within
简化代码
leadership <- within(
leadership,
{
agecat <- NA
agecat[age > 75] <- "Elder"
agecat[age >= 55 & age <= 75] <- "Middle Aged"
agecat[age < 55] <- "Young"
}
)
变量的重命名
names()
函数
names(leadership)
[1] "manager" "date" "country" "gender" "age" "q1" "q2" "q3"
[9] "q4" "q5" "agecat"
names(leadership)[2] <- "testDate"
leadership
manager testDate country gender age q1 q2 q3 q4 q5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
plyr
包中的 rename()
函数
library(plyr)
leadership <- data.frame(
manager,
date,
country,
gender,
age,
q1,
q2,
q3,
q4,
q5,
stringsAsFactors=FALSE
)
leadership <- rename(
leadership,
c(
manager="managerID",
date="testDate"
)
)
leadership
managerID testDate country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F 99 2 2 1 2 1
缺失值
is.na()
函数
is.na(leadership[, 6:10])
q1 q2 q3 q4 q5
[1,] FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE TRUE TRUE
[5,] FALSE FALSE FALSE FALSE FALSE
NA
等特殊值无法使用逻辑关系符比较
na_var <- NA
na_var == NA
[1] NA
Inf
和 -Inf
表示正无穷大和负无穷大
is.infinite(5/0)
[1] TRUE
NaN
表示不可能的值
is.nan(NaN)
[1] TRUE
重编码某些值为缺失值
leadership <- data.frame(
manager,
date,
country,
gender,
age,
q1,
q2,
q3,
q4,
q5,
stringsAsFactors=FALSE
)
leadership$age[leadership$age == 99] <- NA
leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F NA 2 2 1 2 1
在分析中排除缺失值
x <- c(1, 2, NA, 3)
x[1] + x[2] + x[3] + x[4]
[1] NA
sum(x)
[1] NA
na.rm
选项移除缺失值
sum(x, na.rm=TRUE)
[1] 6
na.omit()
函数删除包含缺失值的观测,称为行删除。
注:该方法仅是处理缺失值的一种方法。
leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F NA 2 2 1 2 1
trim_leadership <- na.omit(leadership)
trim_leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
日期
as.Date()
默认格式为 yyyy-mm-dd
as.Date(c("2007-06-22", "2004-02-13"))
[1] "2007-06-22" "2004-02-13"
格式字符串
as.Date(c("01/05/1965", "08/16/1975"), "%m/%d/%Y")
[1] "1965-01-05" "1975-08-16"
leadership$date <- as.Date(leadership$date, "%m/%d/%y")
leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 2008-10-24 US M 32 5 4 5 5 5
2 2 2008-10-28 US F 45 3 5 2 5 5
3 3 2008-10-01 UK F 25 3 5 5 5 2
4 4 2008-10-12 UK M 39 3 3 4 NA NA
5 5 2009-05-01 UK F NA 2 2 1 2 1
Sys.Date()
获取日期
Sys.Date()
[1] "2020-12-28"
date()
获取时间
date()
[1] "Mon Dec 28 20:36:48 2020"
format()
函数格式化日期
today <- Sys.Date()
format(today, format="%B %d %y")
[1] "十二月 28 20"
format(today, format="%A")
[1] "星期一"
算术运算
start_date <- as.Date("2004-02-13")
end_date <- as.Date("2011-01-22")
days <- end_date - start_date
days
Time difference of 2535 days
difftime()
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")
Time difference of 3350.429 weeks
将日期转换为字符串变量
str_date <- as.character(today)
str_date
[1] "2020-12-28"
lubridate 包
library(lubridate)
解析日期和时间
ymd("20110604")
[1] "2011-06-04"
mdy("06-04-2011")
[1] "2011-06-04"
dmy("04/06/2011")
[1] "2011-06-04"
解析时间
arrive <- ymd_hms(
"2011-06-04 12:00:00",
tz="Pacific/Auckland"
)
arrive
[1] "2011-06-04 12:00:00 NZST"
leave <- ymd_hms(
"2011-08-10 14:00:00",
tz="Pacific/Auckland"
)
leave
[1] "2011-08-10 14:00:00 NZST"
设置和提取信息
second(arrive)
[1] 0
second(arrive) <- 25
arrive
[1] "2011-06-04 12:00:25 NZST"
wday(arrive)
[1] 7
wday(arrive, label=TRUE)
[1] 周六
Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
时区
meeting <- ymd_hms(
"2011-07-01 09:00:00",
tz="Pacific/Auckland"
)
with_tz(
meeting,
"America/Chicago"
)
[1] "2011-06-30 16:00:00 CDT"
mistake <- force_tz(
meeting,
"America/Chicago"
)
with_tz(
mistake,
"Pacific/Auckland"
)
[1] "2011-07-02 02:00:00 NZST"
时间段
interval()
函数
auckland <- interval(arrive, leave)
auckland
[1] 2011-06-04 12:00:25 NZST--2011-08-10 14:00:00 NZST
%--%
运算符
auckland <- arrive %--% leave
auckland
[1] 2011-06-04 12:00:25 NZST--2011-08-10 14:00:00 NZST
jsm <- interval(
ymd(20110720, tz="Pacific/Auckland"),
ymd(20110831, tz="Pacific/Auckland")
)
jsm
[1] 2011-07-20 NZST--2011-08-31 NZST
是否有重叠
int_overlaps(jsm, auckland)
[1] TRUE
setdiff(auckland, jsm)
[1] 2011-06-04 12:00:25 NZST--2011-07-20 NZST
日期算术计算
Period
minutes(2)
[1] "2M 0S"
Duration
dminutes(2)
[1] "120s (~2 minutes)"
Duration 与 Period 的区别
leap_year(2011)
[1] FALSE
ymd(20110101) + dyears(1)
[1] "2012-01-01 06:00:00 UTC"
ymd(20110101) + years(1)
[1] "2012-01-01"
leap_year(2012)
[1] TRUE
ymd(20120101) + dyears(1)
[1] "2012-12-31 06:00:00 UTC"
ymd(20120101) + years(1)
[1] "2013-01-01"
计算
meetings <- meeting + weeks(0:5)
meetings
[1] "2011-07-01 09:00:00 NZST" "2011-07-08 09:00:00 NZST" "2011-07-15 09:00:00 NZST"
[4] "2011-07-22 09:00:00 NZST" "2011-07-29 09:00:00 NZST" "2011-08-05 09:00:00 NZST"
%within
meetings %within% jsm
[1] FALSE FALSE FALSE TRUE TRUE TRUE
auckland / ddays(1)
[1] 67.08304
auckland / ddays(2)
[1] 33.54152
auckland / dminutes(1)
[1] 96599.58
auckland %/% months(1)
[1] 2
auckland %% months(1)
[1] 2011-08-04 12:00:25 NZST--2011-08-10 14:00:00 NZST
as.period()
函数
as.period(auckland %% months(1))
[1] "6d 1H 59M 35S"
as.period(auckland)
[1] "2m 6d 1H 59M 35S"
不合理时间
jan31 <- ymd("2013-01-31")
jan31 + months(0:11)
[1] "2013-01-31" NA "2013-03-31" NA "2013-05-31" NA
[7] "2013-07-31" "2013-08-31" NA "2013-10-31" NA "2013-12-31"
floor_date(jan31, "month") + months(0:11) + days(31)
[1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01" "2013-07-02"
[7] "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01" "2013-12-02" "2014-01-01"
jan31 %m+% months(0:11)
[1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31" "2013-06-30"
[7] "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31" "2013-11-30" "2013-12-31"
向量化
lubridate 代码已开启向量化
类型转换
a <- c(1, 2, 3)
a
[1] 1 2 3
is.numeric(a)
[1] TRUE
a <- as.character(a)
a
[1] "1" "2" "3"
is.numeric(a)
[1] FALSE
is.vector(a)
[1] TRUE
is.character(a)
[1] TRUE
数据排序
order()
函数
new_data <- leadership[order(leadership$age),]
new_data
manager date country gender age q1 q2 q3 q4 q5
3 3 2008-10-01 UK F 25 3 5 5 5 2
1 1 2008-10-24 US M 32 5 4 5 5 5
4 4 2008-10-12 UK M 39 3 3 4 NA NA
2 2 2008-10-28 US F 45 3 5 2 5 5
5 5 2009-05-01 UK F NA 2 2 1 2 1
多重排序
attach(leadership)
new_data <- leadership[order(gender, age),]
detach(leadership)
new_data
manager date country gender age q1 q2 q3 q4 q5
3 3 2008-10-01 UK F 25 3 5 5 5 2
2 2 2008-10-28 US F 45 3 5 2 5 5
5 5 2009-05-01 UK F NA 2 2 1 2 1
1 1 2008-10-24 US M 32 5 4 5 5 5
4 4 2008-10-12 UK M 39 3 3 4 NA NA
逆序
attach(leadership)
new_data <- leadership[order(gender, -age),]
detach(leadership)
new_data
manager date country gender age q1 q2 q3 q4 q5
5 5 2009-05-01 UK F NA 2 2 1 2 1
2 2 2008-10-28 US F 45 3 5 2 5 5
3 3 2008-10-01 UK F 25 3 5 5 5 2
4 4 2008-10-12 UK M 39 3 3 4 NA NA
1 1 2008-10-24 US M 32 5 4 5 5 5
数据集的合并
向数据框添加列
merge()
函数
total <- merge(dataframeA, dataframeB, by="ID")
total <- merge(dataframeA, dataframeB, by=c("ID", "Country"))
向数据框添加行
total <- rbind(dataframeA, dataframeB)
数据集取子集
选入(保留)变量
变量序号
new_data <- leadership[, c(6:10)]
new_data
q1 q2 q3 q4 q5
1 5 4 5 5 5
2 3 5 2 5 5
3 3 5 5 5 2
4 3 3 4 NA NA
5 2 2 1 2 1
变量名称
selected_vars <- c("q1", "q2", "q3", "q4", "q5")
new_data <- leadership[selected_vars]
new_data
q1 q2 q3 q4 q5
1 5 4 5 5 5
2 3 5 2 5 5
3 3 5 5 5 2
4 3 3 4 NA NA
5 2 2 1 2 1
selected_vars <- paste("q", 1:5, sep="")
new_data <- leadership[selected_vars]
new_data
q1 q2 q3 q4 q5
1 5 4 5 5 5
2 3 5 2 5 5
3 3 5 5 5 2
4 3 3 4 NA NA
5 2 2 1 2 1
剔除(丢弃)变量
变量名
selected_vars <- names(leadership) %in% c("q3", "q4")
selected_vars
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
new_data <- leadership[!selected_vars]
new_data
manager date country gender age q1 q2 q5
1 1 2008-10-24 US M 32 5 4 5
2 2 2008-10-28 US F 45 3 5 5
3 3 2008-10-01 UK F 25 3 5 2
4 4 2008-10-12 UK M 39 3 3 NA
5 5 2009-05-01 UK F NA 2 2 1
变量序号
new_data <- leadership[c(-8, -9)]
new_data
manager date country gender age q1 q2 q5
1 1 2008-10-24 US M 32 5 4 5
2 2 2008-10-28 US F 45 3 5 5
3 3 2008-10-01 UK F 25 3 5 2
4 4 2008-10-12 UK M 39 3 3 NA
5 5 2009-05-01 UK F NA 2 2 1
设置为 NULL
leadership$q3 <- leadership$q4 <- NULL
leadership
manager date country gender age q1 q2 q5
1 1 2008-10-24 US M 32 5 4 5
2 2 2008-10-28 US F 45 3 5 5
3 3 2008-10-01 UK F 25 3 5 2
4 4 2008-10-12 UK M 39 3 3 NA
5 5 2009-05-01 UK F NA 2 2 1
选入观测
leadership <- data.frame(
manager,
date,
country,
gender,
age,
q1,
q2,
q3,
q4,
q5,
stringsAsFactors=FALSE
)
序号
new_data <- leadership[1:3, ]
new_data
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
条件筛选
new_data <- leadership[
leadership$gender=="M" &
leadership$age > 30,
]
new_data
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
4 4 10/12/08 UK M 39 3 3 4 NA NA
attach(leadership)
new_data <- leadership[
gender == "M" & age > 30,
]
detach(leadership)
new_data
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
4 4 10/12/08 UK M 39 3 3 4 NA NA
which()
函数
leadership$date <- as.Date(
leadership$date,
"%m/%d/%y"
)
start_date <- as.Date("2009-01-01")
end_date <- as.Date("2009-10-31")
new_data <- leadership[
which(
leadership$date >= start_date &
leadership$date <= end_date
),
]
new_data
manager date country gender age q1 q2 q3 q4 q5
5 5 2009-05-01 UK F 99 2 2 1 2 1
subset()
函数
new_data <- subset(
leadership,
age >= 35 | age < 24,
select=c(q1, q2, q3, q4)
)
new_data
q1 q2 q3 q4
2 3 5 2 5
4 3 3 4 NA
5 2 2 1 2
new_data <- subset(
leadership,
gender == "M" & age > 25,
select=gender:q4
)
new_data
gender age q1 q2 q3 q4
1 M 32 5 4 5 5
4 M 39 3 3 4 NA
随机抽样
sample()
sample_data <- leadership[
sample(
1:nrow(leadership),
3,
replace=FALSE
),
]
sample_data
manager date country gender age q1 q2 q3 q4 q5
5 5 2009-05-01 UK F 99 2 2 1 2 1
1 1 2008-10-24 US M 32 5 4 5 5 5
4 4 2008-10-12 UK M 39 3 3 4 NA NA
使用 SQL 语句操作数据框
sqldf 包
library(sqldf)
df <- sqldf(
"select * from mtcars where carb=1 order by mpg",
row.names=TRUE
)
df
mpg cyl disp hp drat wt qsec vs am gear carb
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
sqldf(
paste(
"select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear from mtcars",
"where cyl in (4, 6) group by gear",
sep=" "
)
)
avg_mpg avg_disp gear
1 20.33333 201.0333 3
2 24.53333 123.0167 4
3 25.36667 120.1333 5
参考
R 语言实战