学习R语言:R与其他语言的接口

目录

本文内容来自《R 语言编程艺术》(The Art of R Programming),有部分修改

介绍在 R 语言中调用 C/C++ 和在 Python 中调用 R。

编写能被 R 调用的 C/C++ 函数

通过 .C().Call() 实现

R 与 C/C++ 交互的预备知识

C 语言中二维数组按行存储,R 语言中按列存储

C 语言中下标从 0 开始,R 语言中从 1 开始

R 传递给 C 的参数都是以指针形式存在的,C 语言函数的返回值必须是 void。 从 C 函数返回值必须通过函数的参数。

示例:提取方阵的次对角线元素

#include <R.h>

void subdiag(
  double *m,
  int *n,
  int *k,
  double *result
){
  int nval = *n;
  int kval = *k;
  int stride = nval + 1;
  for(int i = 0, j = kval; i < nval - kval; ++i, j+=stride) {
    result[i] = m[i];
  }
}

执行命令

R CMD SHLIB sd.c

输出

"C:/rtools40/mingw64/bin/"gcc  -I"C:/lang/R/R-4.0.3/include" -DNDEBUG -O2 -Wall  -std=gnu99 -mfpmath=sse -msse2 -mstackrealign -c sd.c -o sd.o
C:/rtools40/mingw64/bin/gcc -shared -s -static-libgcc -o sd.dll tmp.def sd.o -LC:/lang/R/R-4.0.3/bin/x64 -lR
dyn.load("sd.dll")
m <- rbind(
  1:5, 
  6:10, 
  11:15, 
  16:20, 
  21:25
)
k <- 2
.C(
  "subdiag",
  as.double(m),
  as.integer(dim(m)[1]),
  as.integer(k),
  result=double(dim(m)[1] - k)
)
[[1]]
 [1]  1  6 11 16 21  2  7 12 17 22  3  8 13 18 23  4  9 14 19 24  5 10
[23] 15 20 25

[[2]]
[1] 5

[[3]]
[1] 2

$result
[1]  1  6 11

扩展案例:预测离散取值的事件序列

y <- sample(0:1, 1000000, replace=T)
pred_v1 <- function(x, k) {
  n <- length(x)
  k2 <- k/2
  pred <- vector(length=n-k)
  for (i in 1:(n-k)) {
    if(sum(x[i:(i+k-1)]) >= k2) {
      pred[i] <- i
    } else {
      pred[i] <- 0
    }
  }
  return (mean(abs(pred-x[(k+1):n])))
}
system.time(pred_v1(y, 1000))
   user  system elapsed 
   4.67    0.03    4.72 
pred_v2 <- function(x, k) {
  n <- length(x)
  k2 <- k/2
  pred <- vector(length=n-k)
  sm <- sum(x[1:k])
  if (sm >= k2) pred[1] <- 1 else pred[1] <- 0
  if (n - k >= 2) {
    for (i in 2:(n-k)) {
      sm <- sm + x[i+k-1] - x[i-1]
      if (sm >= k2) pred[i] <- 1 else pred[i] <- 0
    }
  }
  return(mean(abs(pred - x[(k+1):n])))
}
system.time(pred_v2(y, 1000))
   user  system elapsed 
   0.19    0.00    0.20 

使用 filter() 函数

pred_v3 <- function(x, k) {
  n <- length(x)
  f <- filter(x, rep(1, k), sides=1)[k:(n-1)]
  k2 <- k/2
  pred <- as.integer(f >= k2)
  return(mean(abs(pred - x[(k+1):n])))
}

计算时间不如上一个版本

system.time(pred_v3(y, 1000))
   user  system elapsed 
   2.88    0.00    2.89 

C 语言版本

#include <R.h>

void pred_v4(int *x, int *n, int *k, double *errrate) {
  int nval = *n;
  int kval = *k;
  int nk = nval - kval;
  int sm = 0;
  int errs = 0;
  int pred;
  
  double k2 = kval / 2.0;
  
  for (int i=0; i< kval; i++) {
    sm += x[i];
  }
  
  if (sm >= k2) {
    pred = 1;
  } else {
    pred = 0;
  }
  
  errs = abs(pred - x[kval]);
  
  for (int i=1; i < nk; i++) {
    sm = sm + x[i+kval-1] - x[i-1];
    if (sm >= k2) {
      pred = 1;
    } else {
      pred = 0;
    }
    errs += abs(pred - x[i+kval]);
  }
  *errrate = (double) errs / nk;
}
R CMD SHLIB pred.c
"C:/rtools40/mingw64/bin/"gcc  -I"C:/lang/R/R-4.0.3/include" -DNDEBUG -O2 -Wall  -std=gnu99 -mfpmath=sse -msse2 -mstackrealign -c pred.c -o pred.o
C:/rtools40/mingw64/bin/gcc -shared -s -static-libgcc -o pred.dll tmp.def pred.o -LC:/lang/R/R-4.0.3/bin/x64 -lR
dyn.load("pred.dll")
system.time(
  .C(
    "pred_v4",
    as.integer(y),
    as.integer(length(y)),
    as.integer(1000),
    errrate=double(1)
  )
)
   user  system elapsed 
      0       0       0 

从 Python 调用 R

使用 rpy2 包在 Python 中使用 R

使用 Anaconda 安装

conda install -c conda-forge r-base=4.0.3 rpy2

Windows 下需要单独设置

import os
os.environ["R_HOME"] = r"C:\Users\windroc\Anaconda3\envs\nwpc-data\lib\R"
os.environ["PATH"]   = r"C:\Users\windroc\Anaconda3\envs\nwpc-data\lib\R\bin\x64" + ";" + os.environ["PATH"]

以下代码来自 rpy2 官方文档:

https://rpy2.github.io/doc/latest/html/introduction.html

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

载入 R 包

r_base = importr('base')

执行 R 代码

执行字符串形式的代码

pi = robjects.r("pi")
pi[0]
3.141592653589793
robjects.r('''
    # create a function `f`
    f <- function(r, verbose=FALSE) {
        if (verbose) {
            cat("I am calling f().\n")
        }
        2 * pi * r
    }
    # call the function `f` with argument value 3
    f(3)
''')
FloatVector with 1 elements.
18.849556
r_f = robjects.globalenv['f']
print(r_f.r_repr())
function (r, verbose = FALSE) 
{
    if (verbose) {
        cat("I am calling f().\n")
    }
    2 * pi * r
}
res = r_f(3)
res
FloatVector with 1 elements.
18.849556

向量

len(robjects.r["pi"])
1
robjects.r["pi"][0]
3.141592653589793

创建向量

res = robjects.StrVector([
    "abc",
    "def"
])
print(res.r_repr())
c("abc", "def")
res = robjects.IntVector([
    1, 2, 3
])
print(res.r_repr())
1:3
res = robjects.FloatVector([
    1.1, 2.2, 3.3
])
print(res.r_repr())
c(1.1, 2.2, 3.3)
v = robjects.FloatVector([
    1.1, 2.2, 3.3,
    4.4, 5.5, 6.6,
])
m = robjects.r["matrix"](v, nrow=2)
print(m)
     [,1] [,2] [,3]
[1,]  1.1  3.3  5.5
[2,]  2.2  4.4  6.6

调用 R 函数

rsum = robjects.r["sum"]
rsum(robjects.IntVector([1, 2, 3]))[0]
6
rsort = robjects.r["sort"]
res = rsort(robjects.IntVector([1, 2, 3]), decreasing=True)
print(res.r_repr())
3:1

示例

from rpy2.robjects import r

r_stats = importr("stats")
a = robjects.IntVector([5, 12, 13])
b = robjects.IntVector([10, 28, 30])
robjects.globalenv["v1"] = a
robjects.globalenv["v2"] = b
lm_result = stats.lm("v2 ~ v1")
print(r_base.summary(lm_result))
...skip...

Residuals:
       1        2        3 
-0.03509  0.28070 -0.24561 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept) -2.59649    0.64508  -4.025   0.1550  
v1           2.52632    0.06077  41.569   0.0153 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3746 on 1 degrees of freedom
Multiple R-squared:  0.9994,	Adjusted R-squared:  0.9988 
F-statistic:  1728 on 1 and 1 DF,  p-value: 0.01531
print(stats.anova(lm_result))
Analysis of Variance Table

Response: v2
          Df Sum Sq Mean Sq F value  Pr(>F)  
v1         1 242.53  242.53    1728 0.01531 *
Residuals  1   0.14    0.14                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
print(lm_result.names)
 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"     
print(lm_result.rx('coefficients'))
$coefficients
(Intercept)          v1 
  -2.596491    2.526316

参考

学习 R 语言系列文章

快速入门

向量

矩阵和数组

列表

数据框

因子和表

编程结构

数学运算与模拟

面向对象编程

输入与输出

字符串操作

基础绘图

性能提升——速度和内存

本文代码请访问如下项目:

https://github.com/perillaroc/the-art-of-r-programming