ISLR实验:R语言简介

目录

本文源自《统计学习导论:基于R语言应用》(ISLR) 中《2.3 实验:R语言简介》章节

注:本文在 Jupyter Notebook 中运行,为了正常显示,绝大部分语句都添加了 print() 函数。如果在命令行交互模式下执行,则无需 print() 函数。

基本命令

c() 函数创建向量,<- 进行赋值

x <- c(1, 3, 2, 5)
print(x)
[1] 1 3 2 5

也可以使用 = 进行赋值,但不推荐

x = c(1, 6, 2)
print(x)
[1] 1 6 2
y <- c(1, 4, 3)

使用 length() 函数获取向量长度

print(length(x))
[1] 3
print(length(y))
[1] 3

向量加法

print(x + y)
[1]  2 10  5

ls() 函数查看所有的对象列表。

rm() 函数可以删除不需要的对象。

print(ls())
[1] "x" "y"
rm(x, y)
print(ls())
character(0)

删除所有对象

rm(list=ls())

matrix() 函数创建矩阵

# Build a 2x2 matrix from four values using named arguments.
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
print(x)
     [,1] [,2]
[1,]    1    3
[2,]    2    4

可以省略参数名称

# matrix() arguments passed positionally: data, nrow, ncol.
x <- matrix(c(1, 2, 3, 4), 2, 2)
print(x)
     [,1] [,2]
[1,]    1    3
[2,]    2    4

创建矩阵时默认先填列再填行,使用 byrow=TRUE 表示先填行再填列

# byrow = TRUE fills the matrix row by row instead of column by column.
print(matrix(c(1, 2, 3, 4), 2, 2, byrow = TRUE))
     [,1] [,2]
[1,]    1    2
[2,]    3    4
print(sqrt(x))
         [,1]     [,2]
[1,] 1.000000 1.732051
[2,] 1.414214 2.000000
print(x^2)
     [,1] [,2]
[1,]    1    9
[2,]    4   16

rnorm() 函数生成服从正态分布的随机数向量

cor() 计算变量之间的相关系数

# y is x plus noise with mean 50 and a small standard deviation, so the
# correlation between x and y is close to 1.
x <- rnorm(50)
y <- x + rnorm(50, mean = 50, sd = 0.1)
print(cor(x, y))
[1] 0.9960602

set.seed() 设置随机数种子

# Fix the RNG seed so the 50 normal draws printed here are reproducible.
set.seed(1303)
print(rnorm(50))
 [1] -1.1439763145  1.3421293656  2.1853904757  0.5363925179  0.0631929665
 [6]  0.5022344825 -0.0004167247  0.5658198405 -0.5725226890 -1.1102250073
[11] -0.0486871234 -0.6956562176  0.8289174803  0.2066528551 -0.2356745091
[16] -0.5563104914 -0.3647543571  0.8623550343 -0.6307715354  0.3136021252
[21] -0.9314953177  0.8238676185  0.5233707021  0.7069214120  0.4202043256
[26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
[31]  1.5732737361  0.0127465055  0.8726470499  0.4220661905 -0.0188157917
[36]  2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412  1.3677342065
[41]  0.2640073322  0.6321868074 -1.3306509858  0.0268888182  1.0406363208
[46]  1.3120237985 -0.0300020767 -0.2500257125  0.0234144857  1.6598706557

mean() 计算均值

var() 计算方差

sd() 计算标准差

# Seed the generator, draw 100 standard-normal values, and report their mean.
set.seed(3)
y <- rnorm(100)
print(mean(y))
[1] 0.01103557
print(var(y))
[1] 0.7328675
print(sqrt(var(y)))
[1] 0.8560768
print(sd(y))
[1] 0.8560768

图形

plot() 函数

# Scatter plot of two separate samples of 100 standard-normal draws.
x <- rnorm(100)
y <- rnorm(100)
plot(x, y)

添加坐标轴标签和标题

# Scatter plot of x against y with axis labels and a main title.
# The trailing comma after `main` was removed: it passed an empty argument
# to plot(), which only works by accident of argument matching.
plot(
    x, y,
    xlab = "this is the x-axis",
    ylab = "this is the y-axis",
    main = "Plot of X vs Y"
)

使用 pdf()、jpeg() 等函数保存图片

# Open a PDF graphics device so the following plot is written to Figure.pdf.
pdf("Figure.pdf")
plot(x, y, col="green")
# Close the device to finish writing the file.
dev.off()

seq() 用于生成序列

# seq(from, to) yields the integers from 1 to 10.
x <- seq(1, 10)
print(x)
 [1]  1  2  3  4  5  6  7  8  9 10
# The colon operator is shorthand for the same integer sequence.
x <- 1:10
print(x)
 [1]  1  2  3  4  5  6  7  8  9 10
# 50 evenly spaced points from -pi to pi. `length.out` is spelled out in
# full instead of relying on partial matching of `length`.
x <- seq(-pi, pi, length.out = 50)

contour() 函数用于绘制等值线图

# Evaluate f(x, y) = cos(y) / (1 + x^2) for every pair of x and y values.
y <- x
f <- outer(x, y, function(x, y) cos(y) / (1 + x^2))
# Draw the default contours, then overlay a denser set on the same plot
# (add = TRUE draws onto the existing plot instead of starting a new one).
contour(x, y, f)
contour(
    x, y, f,
    nlevels = 45,
    add = TRUE
)
# fa is the antisymmetric part of f: half the difference between f and
# its transpose.
fa <- (f - t(f)) / 2
contour(
    x, y, fa,
    nlevels = 15
)

image() 函数绘制 heatmap

image(x, y, fa)

persp() 函数绘制三维图

# Perspective (3D surface) plots of fa over the x-y grid. theta and phi
# set the viewing direction (see ?persp); each call changes one of them
# to show the surface from a different angle.
persp(x, y, fa)
persp(x, y, fa, theta=30)
persp(x, y, fa, theta=30, phi=20)
persp(x, y, fa, theta=30, phi=70)
persp(x, y, fa, theta=30, phi=40)

索引数据

# 4x4 matrix holding 1..16, filled column by column.
A <- matrix(1:16, nrow = 4, ncol = 4)
print(A)
     [,1] [,2] [,3] [,4]
[1,]    1    5    9   13
[2,]    2    6   10   14
[3,]    3    7   11   15
[4,]    4    8   12   16
print(A[2, 3])
[1] 10
print(A[c(1, 3), c(2, 4)])
     [,1] [,2]
[1,]    5   13
[2,]    7   15
print(A[1:3, 2:4])
     [,1] [,2] [,3]
[1,]    5    9   13
[2,]    6   10   14
[3,]    7   11   15
print(A[1:2,])
     [,1] [,2] [,3] [,4]
[1,]    1    5    9   13
[2,]    2    6   10   14
print(A[, 1:2])
     [,1] [,2]
[1,]    1    5
[2,]    2    6
[3,]    3    7
[4,]    4    8
print(A[1,])
[1]  1  5  9 13
print(A[-c(1,3),])
     [,1] [,2] [,3] [,4]
[1,]    2    6   10   14
[2,]    4    8   12   16

负数索引用于去掉对应的元素

print(A[-c(1, 3), -c(1, 3, 4)])
[1] 6 8

dim() 函数返回行数和列数

print(dim(A))
[1] 4 4

载入数据

使用 Auto.data 文件

# Read the raw file with read.table()'s default settings, then open the
# result in the interactive data editor to inspect how it was parsed.
Auto = read.table("Auto.data")
fix(Auto)

将第一行作为列名,并将问号识别为 NA

# Re-read the file, treating the first line as column names and "?" as NA.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Auto <- read.table(
    "Auto.data",
    header = TRUE,
    na.strings = "?"
)
fix(Auto)
# Load the CSV file with the same header and NA options.
Auto <- read.csv(
    "Auto.csv",
    header = TRUE,
    na.strings = "?"
)
print(dim(Auto))
[1] 397   9
Auto[1:4,]

删掉有 NA 的行

# Drop every row that contains at least one NA and report the new dimensions.
Auto <- na.omit(Auto)
print(dim(Auto))
[1] 392   9
print(names(Auto))
[1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
[6] "acceleration" "year"         "origin"       "name"     

其他的图形和数值汇总

使用 plot() 绘制定量变量的散点图

plot(Auto$cylinders, Auto$mpg)

使用 attach() 函数指定数据集后,可以直接使用数据集中的数据

# attach() puts the columns of Auto on the search path, so cylinders and
# mpg can be referenced by bare name in the plot call.
attach(Auto)
plot(cylinders, mpg)

as.factor() 函数将定量变量转换成定性变量

# Convert cylinders to a factor; with a factor on the x-axis, plot() draws
# box plots of mpg per level instead of a scatter plot.
# NOTE: this assigns a new global `cylinders`; the column inside Auto is
# not modified.
cylinders <- as.factor(cylinders)
plot(cylinders, mpg)
plot(cylinders, mpg, col = "red")
# varwidth = TRUE scales box widths by group size (see ?boxplot).
plot(
    cylinders, mpg,
    col = "red",
    varwidth = TRUE
)
# horizontal = TRUE draws the boxes horizontally.
plot(
    cylinders, mpg,
    col = "red",
    varwidth = TRUE,
    horizontal = TRUE
)
plot(
    cylinders, mpg,
    col = "red",
    varwidth = TRUE,
    xlab = "cylinders",
    ylab = "MPG"
)

hist() 函数绘制直方图

hist(mpg)
hist(mpg, col=2)
hist(mpg, col=2, breaks=15)

pairs() 函数建立数据集中每一对变量的散点图矩阵

# pairs(Auto) raises an error here (presumably because of the character
# `name` column -- TODO confirm), so the call is kept commented out.
# pairs(Auto)
# Scatterplot matrix restricted to the listed columns via a one-sided formula.
pairs(~ mpg + displacement + horsepower + weight + acceleration, Auto)

identify() 函数用于交互

plot(horsepower, mpg)
identify(horsepower, mpg, name)

summary() 显示数据汇总信息

summary(Auto)
      mpg          cylinders      displacement     horsepower        weight    
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
 Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
 Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
  acceleration        year           origin          name          
 Min.   : 8.00   Min.   :70.00   Min.   :1.000   Length:392        
 1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   Class :character  
 Median :15.50   Median :76.00   Median :1.000   Mode  :character  
 Mean   :15.54   Mean   :75.98   Mean   :1.577                     
 3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000                     
 Max.   :24.80   Max.   :82.00   Max.   :3.000  
summary(mpg)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   9.00   17.00   22.75   23.45   29.00   46.60 

参考

学习 R 语言:快速入门

学习 R 语言:向量

学习 R 语言:矩阵和数组