R语言实战:基本数据管理

目录

本文内容来自《R 语言实战》(R in Action, 2nd),有部分修改

示例

构造 data.frame 对象

manager <- 1:5
date <- c(
  "10/24/08",
  "10/28/08",
  "10/1/08",
  "10/12/08",
  "5/1/09"
)
country <- c(
  "US",
  "US",
  "UK",
  "UK",
  "UK"
)
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)

leadership <- data.frame(
  manager,
  date,
  country,
  gender,
  age,
  q1,
  q2,
  q3,
  q4,
  q5,
  stringsAsFactors=FALSE
)
leadership
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  45  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
4       4 10/12/08      UK      M  39  3  3  4 NA NA
5       5   5/1/09      UK      F  99  2  2  1  2  1

每一列表示一个变量,每一行表示一个观测

创建新变量

三种方式

赋值

mydata <- data.frame(
  x1=c(2, 2, 6, 4),
  x2=c(3, 4, 2, 8)
)
mydata$sumx <- mydata$x1 + mydata$x2
mydata$meanx <- (mydata$x1 + mydata$x2)/2

使用 attach() 省略对象名

attach(mydata)
mydata$sumx <- x1 + x2
mydata$meanx <- (x1 + x2)/2
detach(mydata)

transform() 函数

mydata <- transform(
  mydata,
  sumx=x1 + x2,
  meanx=(x1 + x2)/2
)

变量的重编码

修改现有变量

leadership$age[leadership$age == 99] <- NA
leadership
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  45  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
4       4 10/12/08      UK      M  39  3  3  4 NA NA
5       5   5/1/09      UK      F  NA  2  2  1  2  1

增加新变量

leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[
  leadership$age >= 55 &
  leadership$age <= 75
] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
leadership
  manager     date country gender age q1 q2 q3 q4 q5 agecat
1       1 10/24/08      US      M  32  5  4  5  5  5  Young
2       2 10/28/08      US      F  45  3  5  2  5  5  Young
3       3  10/1/08      UK      F  25  3  5  5  5  2  Young
4       4 10/12/08      UK      M  39  3  3  4 NA NA  Young
5       5   5/1/09      UK      F  NA  2  2  1  2  1   <NA>

使用 within 简化代码

leadership <- within(
  leadership,
  {
    agecat <- NA
    agecat[age > 75] <- "Elder"
    agecat[age >= 55 & age <= 75] <- "Middle Aged"
    agecat[age < 55] <- "Young"
  }
)

变量的重命名

names() 函数

names(leadership)
 [1] "manager" "date"    "country" "gender"  "age"     "q1"      "q2"      "q3"     
 [9] "q4"      "q5"      "agecat" 
names(leadership)[2] <- "testDate"
leadership
  manager testDate country gender age q1 q2 q3 q4 q5 agecat
1       1 10/24/08      US      M  32  5  4  5  5  5  Young
2       2 10/28/08      US      F  45  3  5  2  5  5  Young
3       3  10/1/08      UK      F  25  3  5  5  5  2  Young
4       4 10/12/08      UK      M  39  3  3  4 NA NA  Young
5       5   5/1/09      UK      F  NA  2  2  1  2  1   <NA>

plyr 包中的 rename() 函数

library(plyr)

leadership <- data.frame(
  manager,
  date,
  country,
  gender,
  age,
  q1,
  q2,
  q3,
  q4,
  q5,
  stringsAsFactors=FALSE
)

leadership <- rename(
  leadership,
  c(
    manager="managerID",
    date="testDate"
  )
)
leadership
  managerID testDate country gender age q1 q2 q3 q4 q5
1         1 10/24/08      US      M  32  5  4  5  5  5
2         2 10/28/08      US      F  45  3  5  2  5  5
3         3  10/1/08      UK      F  25  3  5  5  5  2
4         4 10/12/08      UK      M  39  3  3  4 NA NA
5         5   5/1/09      UK      F  99  2  2  1  2  1

缺失值

is.na() 函数

is.na(leadership[, 6:10])
        q1    q2    q3    q4    q5
[1,] FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE  TRUE  TRUE
[5,] FALSE FALSE FALSE FALSE FALSE

NA 等特殊值无法使用逻辑关系符比较

na_var <- NA
na_var == NA
[1] NA

Inf-Inf 表示正无穷大和负无穷大

is.infinite(5/0)
[1] TRUE

NaN 表示不可能的值

is.nan(NaN)
[1] TRUE

重编码某些值为缺失值

leadership <- data.frame(
  manager,
  date,
  country,
  gender,
  age,
  q1,
  q2,
  q3,
  q4,
  q5,
  stringsAsFactors=FALSE
)

leadership$age[leadership$age == 99] <- NA
leadership
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  45  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
4       4 10/12/08      UK      M  39  3  3  4 NA NA
5       5   5/1/09      UK      F  NA  2  2  1  2  1

在分析中排除缺失值

x <- c(1, 2, NA, 3)
x[1] + x[2] + x[3] + x[4]
[1] NA
sum(x)
[1] NA

na.rm 选项移除缺失值

sum(x, na.rm=TRUE)
[1] 6

na.omit() 函数删除包含缺失值的观测,称为行删除。

注:该方法仅是处理缺失值的一种方法。

leadership
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  45  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2
4       4 10/12/08      UK      M  39  3  3  4 NA NA
5       5   5/1/09      UK      F  NA  2  2  1  2  1
trim_leadership <- na.omit(leadership)
trim_leadership
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  45  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2

日期

as.Date()

默认格式为 yyyy-mm-dd

as.Date(c("2007-06-22", "2004-02-13"))
[1] "2007-06-22" "2004-02-13"

格式字符串

as.Date(c("01/05/1965", "08/16/1975"), "%m/%d/%Y")
[1] "1965-01-05" "1975-08-16"
leadership$date <- as.Date(leadership$date, "%m/%d/%y")
leadership
  manager       date country gender age q1 q2 q3 q4 q5
1       1 2008-10-24      US      M  32  5  4  5  5  5
2       2 2008-10-28      US      F  45  3  5  2  5  5
3       3 2008-10-01      UK      F  25  3  5  5  5  2
4       4 2008-10-12      UK      M  39  3  3  4 NA NA
5       5 2009-05-01      UK      F  NA  2  2  1  2  1

Sys.Date() 获取日期

Sys.Date()
[1] "2020-12-28"

date() 获取时间

date()
[1] "Mon Dec 28 20:36:48 2020"

format() 函数格式化日期

today <- Sys.Date()
format(today, format="%B %d %y")
[1] "十二月 28 20"
format(today, format="%A")
[1] "星期一"

算术运算

start_date <- as.Date("2004-02-13")
end_date <- as.Date("2011-01-22")
days <- end_date - start_date
days
Time difference of 2535 days

difftime()

today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")
Time difference of 3350.429 weeks

将日期转换为字符串变量

str_date <- as.character(today)
str_date
[1] "2020-12-28"

lubridate 包

library(lubridate)

解析日期和时间

ymd("20110604")
[1] "2011-06-04"
mdy("06-04-2011")
[1] "2011-06-04"
dmy("04/06/2011")
[1] "2011-06-04"

解析时间

arrive <- ymd_hms(
  "2011-06-04 12:00:00", 
  tz="Pacific/Auckland"
)
arrive
[1] "2011-06-04 12:00:00 NZST"
leave <- ymd_hms(
  "2011-08-10 14:00:00", 
  tz="Pacific/Auckland"
)
leave
[1] "2011-08-10 14:00:00 NZST"

设置和提取信息

second(arrive)
[1] 0
second(arrive) <- 25
arrive
[1] "2011-06-04 12:00:25 NZST"
wday(arrive)
[1] 7
wday(arrive, label=TRUE)
[1] 周六
Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六

时区

meeting <- ymd_hms(
  "2011-07-01 09:00:00", 
  tz="Pacific/Auckland"
)
with_tz(
  meeting, 
  "America/Chicago"
)
[1] "2011-06-30 16:00:00 CDT"
mistake <- force_tz(
  meeting,
  "America/Chicago"
)
with_tz(
  mistake,
  "Pacific/Auckland"
)
[1] "2011-07-02 02:00:00 NZST"

时间段

interval() 函数

auckland <- interval(arrive, leave)
auckland
[1] 2011-06-04 12:00:25 NZST--2011-08-10 14:00:00 NZST

%--% 运算符

auckland <- arrive %--% leave
auckland
[1] 2011-06-04 12:00:25 NZST--2011-08-10 14:00:00 NZST
jsm <- interval(
  ymd(20110720, tz="Pacific/Auckland"),
  ymd(20110831, tz="Pacific/Auckland")
)
jsm
[1] 2011-07-20 NZST--2011-08-31 NZST

是否有重叠

int_overlaps(jsm, auckland)
[1] TRUE
setdiff(auckland, jsm)
[1] 2011-06-04 12:00:25 NZST--2011-07-20 NZST

日期算术计算

Period

minutes(2)
[1] "2M 0S"

Duration

dminutes(2)
[1] "120s (~2 minutes)"

Duration 与 Period 的区别

leap_year(2011)
[1] FALSE
ymd(20110101) + dyears(1)
[1] "2012-01-01 06:00:00 UTC"
ymd(20110101) + years(1)
[1] "2012-01-01"
leap_year(2012)
[1] TRUE
ymd(20120101) + dyears(1)
[1] "2012-12-31 06:00:00 UTC"
ymd(20120101) + years(1)
[1] "2013-01-01"

计算

meetings <- meeting + weeks(0:5)
meetings
[1] "2011-07-01 09:00:00 NZST" "2011-07-08 09:00:00 NZST" "2011-07-15 09:00:00 NZST"
[4] "2011-07-22 09:00:00 NZST" "2011-07-29 09:00:00 NZST" "2011-08-05 09:00:00 NZST"

%within

meetings %within% jsm
[1] FALSE FALSE FALSE  TRUE  TRUE  TRUE
auckland / ddays(1)
[1] 67.08304
auckland / ddays(2)
[1] 33.54152
auckland / dminutes(1)
[1] 96599.58
auckland %/% months(1)
[1] 2
auckland %% months(1)
[1] 2011-08-04 12:00:25 NZST--2011-08-10 14:00:00 NZST

as.period() 函数

as.period(auckland %% months(1))
[1] "6d 1H 59M 35S"
as.period(auckland)
[1] "2m 6d 1H 59M 35S"

不合理时间

jan31 <- ymd("2013-01-31")
jan31 + months(0:11)
 [1] "2013-01-31" NA           "2013-03-31" NA           "2013-05-31" NA          
 [7] "2013-07-31" "2013-08-31" NA           "2013-10-31" NA           "2013-12-31"
floor_date(jan31, "month") + months(0:11) + days(31)
 [1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01" "2013-07-02"
 [7] "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01" "2013-12-02" "2014-01-01"
jan31 %m+% months(0:11)
 [1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31" "2013-06-30"
 [7] "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31" "2013-11-30" "2013-12-31"

向量化

lubridate 代码已开启向量化

类型转换

a <- c(1, 2, 3)
a
[1] 1 2 3
is.numeric(a)
[1] TRUE
a <- as.character(a)
a
[1] "1" "2" "3"
is.numeric(a)
[1] FALSE
is.vector(a)
[1] TRUE
is.character(a)
[1] TRUE

数据排序

order() 函数

new_data <- leadership[order(leadership$age),]
new_data
  manager       date country gender age q1 q2 q3 q4 q5
3       3 2008-10-01      UK      F  25  3  5  5  5  2
1       1 2008-10-24      US      M  32  5  4  5  5  5
4       4 2008-10-12      UK      M  39  3  3  4 NA NA
2       2 2008-10-28      US      F  45  3  5  2  5  5
5       5 2009-05-01      UK      F  NA  2  2  1  2  1

多重排序

attach(leadership)
new_data <- leadership[order(gender, age),]
detach(leadership)
new_data
  manager       date country gender age q1 q2 q3 q4 q5
3       3 2008-10-01      UK      F  25  3  5  5  5  2
2       2 2008-10-28      US      F  45  3  5  2  5  5
5       5 2009-05-01      UK      F  NA  2  2  1  2  1
1       1 2008-10-24      US      M  32  5  4  5  5  5
4       4 2008-10-12      UK      M  39  3  3  4 NA NA

逆序

attach(leadership)
new_data <- leadership[order(gender, -age),]
detach(leadership)
new_data
  manager       date country gender age q1 q2 q3 q4 q5
5       5 2009-05-01      UK      F  NA  2  2  1  2  1
2       2 2008-10-28      US      F  45  3  5  2  5  5
3       3 2008-10-01      UK      F  25  3  5  5  5  2
4       4 2008-10-12      UK      M  39  3  3  4 NA NA
1       1 2008-10-24      US      M  32  5  4  5  5  5

数据集的合并

向数据框添加列

merge() 函数

total <- merge(dataframeA, dataframeB, by="ID")
total <- merge(dataframeA, dataframeB, by=c("ID", "Country"))

向数据框添加行

total <- rbind(dataframeA, dataframeB)

数据集取子集

选入(保留)变量

变量序号

new_data <- leadership[, c(6:10)]
new_data
  q1 q2 q3 q4 q5
1  5  4  5  5  5
2  3  5  2  5  5
3  3  5  5  5  2
4  3  3  4 NA NA
5  2  2  1  2  1

变量名称

selected_vars <- c("q1", "q2", "q3", "q4", "q5")
new_data <- leadership[selected_vars]
new_data
  q1 q2 q3 q4 q5
1  5  4  5  5  5
2  3  5  2  5  5
3  3  5  5  5  2
4  3  3  4 NA NA
5  2  2  1  2  1
selected_vars <- paste("q", 1:5, sep="")
new_data <- leadership[selected_vars]
new_data
  q1 q2 q3 q4 q5
1  5  4  5  5  5
2  3  5  2  5  5
3  3  5  5  5  2
4  3  3  4 NA NA
5  2  2  1  2  1

剔除(丢弃)变量

变量名

selected_vars <- names(leadership) %in% c("q3", "q4")
selected_vars
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
new_data <- leadership[!selected_vars]
new_data
  manager       date country gender age q1 q2 q5
1       1 2008-10-24      US      M  32  5  4  5
2       2 2008-10-28      US      F  45  3  5  5
3       3 2008-10-01      UK      F  25  3  5  2
4       4 2008-10-12      UK      M  39  3  3 NA
5       5 2009-05-01      UK      F  NA  2  2  1

变量序号

new_data <- leadership[c(-8, -9)]
new_data
  manager       date country gender age q1 q2 q5
1       1 2008-10-24      US      M  32  5  4  5
2       2 2008-10-28      US      F  45  3  5  5
3       3 2008-10-01      UK      F  25  3  5  2
4       4 2008-10-12      UK      M  39  3  3 NA
5       5 2009-05-01      UK      F  NA  2  2  1

设置为 NULL

leadership$q3 <- leadership$q4 <- NULL
leadership
  manager       date country gender age q1 q2 q5
1       1 2008-10-24      US      M  32  5  4  5
2       2 2008-10-28      US      F  45  3  5  5
3       3 2008-10-01      UK      F  25  3  5  2
4       4 2008-10-12      UK      M  39  3  3 NA
5       5 2009-05-01      UK      F  NA  2  2  1

选入观测

leadership <- data.frame(
  manager,
  date,
  country,
  gender,
  age,
  q1,
  q2,
  q3,
  q4,
  q5,
  stringsAsFactors=FALSE
)

序号

new_data <- leadership[1:3, ]
new_data
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
2       2 10/28/08      US      F  45  3  5  2  5  5
3       3  10/1/08      UK      F  25  3  5  5  5  2

条件筛选

new_data <- leadership[
  leadership$gender=="M" &
  leadership$age > 30,
]
new_data
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
4       4 10/12/08      UK      M  39  3  3  4 NA NA
attach(leadership)
new_data <- leadership[
  gender == "M" & age > 30,
]
detach(leadership)
new_data
  manager     date country gender age q1 q2 q3 q4 q5
1       1 10/24/08      US      M  32  5  4  5  5  5
4       4 10/12/08      UK      M  39  3  3  4 NA NA

which() 函数

leadership$date <- as.Date(
  leadership$date, 
  "%m/%d/%y"
)

start_date <- as.Date("2009-01-01")
end_date <- as.Date("2009-10-31")

new_data <- leadership[
  which(
    leadership$date >= start_date & 
    leadership$date <= end_date
  ),
]
new_data
  manager       date country gender age q1 q2 q3 q4 q5
5       5 2009-05-01      UK      F  99  2  2  1  2  1

subset() 函数

new_data <- subset(
  leadership,
  age >= 35 | age < 24,
  select=c(q1, q2, q3, q4)
)
new_data
  q1 q2 q3 q4
2  3  5  2  5
4  3  3  4 NA
5  2  2  1  2
new_data <- subset(
  leadership,
  gender == "M" & age > 25,
  select=gender:q4
)
new_data
  gender age q1 q2 q3 q4
1      M  32  5  4  5  5
4      M  39  3  3  4 NA

随机抽样

sample()

sample_data <- leadership[
  sample(
    1:nrow(leadership),
    3,
    replace=FALSE
  ),
]
sample_data
  manager       date country gender age q1 q2 q3 q4 q5
5       5 2009-05-01      UK      F  99  2  2  1  2  1
1       1 2008-10-24      US      M  32  5  4  5  5  5
4       4 2008-10-12      UK      M  39  3  3  4 NA NA

使用 SQL 语句操作数据框

sqldf 包

library(sqldf)
df <- sqldf(
  "select * from mtcars where carb=1 order by mpg",
  row.names=TRUE
)
df
                mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
sqldf(
  paste(
    "select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear from mtcars",
    "where cyl in (4, 6) group by gear",
    sep=" "
  )
)
   avg_mpg avg_disp gear
1 20.33333 201.0333    3
2 24.53333 123.0167    4
3 25.36667 120.1333    5

参考

R 语言实战

R语言实战:图形初阶