error gathering device information while adding custom device

shim/vendor/github.com/docker/docker/oci/devices_linux.go:85:   return devs, devPermissions, fmt.Errorf("error gathering device information while adding custom device %q: %s", pathOnHost, err)
shim/vendor/github.com/moby/moby/oci/devices_linux.go:85:       return devs, devPermissions, fmt.Errorf("error gathering device information while adding custom device %q: %s", pathOnHost, err)

Here is some code I found in another kernel module, which uses the sys_* calls:

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/syscalls.h>
/* Snip */

int openflags = O_WRONLY|O_CREAT;
if (ml != 1)
        openflags |= O_TRUNC;
wfd = sys_open(collected, openflags, mode);

if (wfd >= 0) {
    sys_fchown(wfd, uid, gid);
    sys_fchmod(wfd, mode);
    state = CopyFile;
}
Also found:

asmlinkage long sys_rename(const char __user *oldname, const char __user *newname);
asmlinkage long sys_chmod(const char __user *filename, mode_t mode);
asmlinkage long sys_fchmod(unsigned int fd, mode_t mode);

func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error)

chroot pivotRoot

drivers/char/random.c

func setupDevSymlinks(rootfs string) error {
    var links = [][2]string{
        {"/proc/self/fd", "/dev/fd"},
        {"/proc/self/fd/0", "/dev/stdin"},
        {"/proc/self/fd/1", "/dev/stdout"},
        {"/proc/self/fd/2", "/dev/stderr"},
    }

    // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
    // in /dev if it exists in /proc.
    if _, err := os.Stat("/proc/kcore"); err == nil {
        links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
    }

    for _, link := range links {
        var (
            src = link[0]
            dst = filepath.Join(rootfs, link[1])
        )

        if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
            return fmt.Errorf("symlink %s %s %s", src, dst, err)
        }
    }

    return nil
}

// Reads the specified statistics available under /sys/class/net/<EthInterface>/statistics
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
        data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
        if err != nil {
                return 0, err
        }
        return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
}

runc 启动容器过程分析（附 CVE-2019-5736 实现过程）

2019, FEB 15

环境

OCI runtime spec 地址：https://github.com/opencontainers/runtime-spec
runc 地址：https://github.com/opencontainers/runc/
Commit：f414f497b50a61750ea3af9fccf998a3db687cea
系统版本：Fedora Release 28
内核版本：4.17.9-200.fc28.x86_64

runc 介绍

runc 实现了 OCI 的容器标准，能够管理容器的生命周期。runc 的详细功能请参考帮助文档。

runc 不是基于 server 形式的，所以所有的配置和状态都会存储在本地文件系统中（以下均为使用 docker 时的默认路径）：

容器配置：/run/docker/libcontainerd/{cnotainer-id}/config.json
容器 init 进程的标准输入输出流：/run/docker/libcontainerd/{cnotainer-id}/{init-stdin,init-stdout,init-stderr}
容器状态信息：/run/runc/*/state.json

runc 创建容器时会将状态记录到 state.json 中，所有查询都是从 state.json 中取得容器基本信息，然后再从系统中获取容器实时状态。

docker 的调用链如下：

docker-client -> dockerd -> docker-containerd -> docker-containerd-shim -> runc（容器外） -> runc（容器内） -> containter-entrypoint

runc 启动容器过程

runc 在被 docker-containerd-shim 调用时，参数中会指定容器的配置路径（即 config.json 的位置），同时容器的根路径也已经准备完毕，因此 runc 不会有跟镜像相关的概念。容器的启动过程分析直接从 runc run 开始，即 docker 调用链中的 runc（容器外）这个时间点。

runc（容器外）环境准备

读取 config.json（github.com/opencontainers/runc/run.go#65）：

// 读取 config.json
spec, err := setupSpec(context)
if err != nil {
	return err
}
// 启动容器
status, err := startContainer(context, spec, CT_ACT_RUN, nil)
if err == nil {
	os.Exit(status)
}
return err

startContainer 创建容器信息，并启动（github.com/opencontainers/runc/utils_linux.go#396）：

func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
    // 通过 spec 创建容器结构，在 createContainer 中将 spec 转换为了 runc 的 container config
	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}
    // 构建 runner 启动容器
	r := &runner{
		// 容器
		container:       container,
		// 即 CT_ACT_RUN
		action:          action,
		// 用于设置 process.Init 字段
		init:            true,
	}
	return r.run(spec.Process)
}

r.run() 启动容器（github.com/opencontainers/runc/utils_linux.go#268）：

func (r *runner) run(config *specs.Process) (int, error) {
	// 根据 config 构建容器进程，此处 r.init 为 true
	process, err := newProcess(*config, r.init)
	if err != nil {
		r.destroy()
		return -1, err
	}

    // 根据 action 调用 container 的对应方法
	switch r.action {
	case CT_ACT_CREATE:
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
    case CT_ACT_RUN:
        // 此处调用的是这个方法
		err = r.container.Run(process)
	default:
		panic("Unknown action")
	}
}

container 是由 createContainer() 方法创建，根据创建链路 createContainer() -> loadFactory() -> libcontainer.New() 确认容器由 LinuxFactory.Create() 创建：

// github.com/opencontainers/runc/libcontainer/factory_linux.go#132
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	l := &LinuxFactory{
        // 指向当前的 exe 程序，即 runc 本身
        InitPath:  "/proc/self/exe",
        // os.Args[0] 是当前 runc 的路径，本质上和 InitPath 是一样的，即 runc init
		InitArgs:  []string{os.Args[0], "init"},
	}
	return l, nil
}

// github.com/opencontainers/runc/libcontainer/factory_linux.go#189
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
    // 创建 linux 容器结构
	c := &linuxContainer{
        // 容器 ID
        id:            id,
        // 容器状态文件存放目录，默认是 /run/runc/{容器 id}/
        root:          containerRoot,
        // 容器配置
        config:        config,
        // 即 /proc/self/exe，就是 runc
        initPath:      l.InitPath,
        // 即 runc init
		initArgs:      l.InitArgs,
	}
	return c, nil
}

所以整个容器的启动逻辑在 linuxContainer.Run() 里，调用链是 linuxContainer.Run() -> linuxContainer.Start() -> linuxContainer.start()：

// github.com/opencontainers/runc/libcontainer/container_linux.go#334
func (c *linuxContainer) start(process *Process) error {
    // process 是容器的 entrypoint，此处创建的是 entrypoint 的父进程
	parent, err := c.newParentProcess(process)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
    }
    // 启动父进程
	if err := parent.start(); err != nil {
		// terminate the process to ensure that it properly is reaped.
		if err := ignoreTerminateErrors(parent.terminate()); err != nil {
			logrus.Warn(err)
		}
		return newSystemErrorWithCause(err, "starting container process")
	}
}

func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
    // 创建用于父子进程通信的 pipe
	parentPipe, childPipe, err := utils.NewSockPair("init")
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
    }
    // 创建父进程的 cmd
	cmd, err := c.commandTemplate(p, childPipe)
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new command template")
	}
	if !p.Init {
        // 由于 p.Init 为 true，所以不会执行到这里
		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
	}

    // 返回标准 init 进程
	return c.newInitProcess(p, cmd, parentPipe, childPipe)
}

func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
    // 这里可以看到 cmd 就是 runc init
	cmd := exec.Command(c.initPath, c.initArgs[1:]...)
    cmd.Args[0] = c.initArgs[0]
    // 将设置给容器 entrypoint 的 std 流给了 runc init 命令，这些流最终会通过 runc init 传递给 entrypoint 
	cmd.Stdin = p.Stdin
	cmd.Stdout = p.Stdout
    cmd.Stderr = p.Stderr
    
    // 这个 childPipe 用于跟父进程通信（父进程就是当前这个 runc 进程）
    cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
    // 通过环境变量 _LIBCONTAINER_INITPIPE 把 fd 号传递给 runc init，由于 std 流会占用前三个 fd 编号（0，1，2）
    // 所以 fd 要加上 3（stdioFdCount）
    cmd.Env = append(cmd.Env,
		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
	)
	return cmd, nil
}

func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
    // 这里通过环境变量 _LIBCONTAINER_INITTYPE 设置 init 类型为 standard（initStandard）
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
	nsMaps := make(map[configs.NamespaceType]string)
	for _, ns := range c.config.Namespaces {
		if ns.Path != "" {
			nsMaps[ns.Type] = ns.Path
		}
	}
    _, sharePidns := nsMaps[configs.NEWPID]
    // 构造 namespace 设置，然后序列化成字节数据
	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
	if err != nil {
		return nil, err
	}
	init := &initProcess{
		cmd:             cmd,
		childPipe:       childPipe,
		parentPipe:      parentPipe,
		manager:         c.cgroupManager,
        intelRdtManager: c.intelRdtManager,
        
		config:          c.newInitConfig(p),
		container:       c,
		process:         p,
		bootstrapData:   data,
		sharePidns:      sharePidns,
	}
	c.initProcess = init
	return init, nil
}

在 linuxContainer.start() 中，创建了一个命令是 runc init 的初始化进程（initProcess），并启动了该进程，这里是 runc（容器外）的最核心的逻辑：

// github.com/opencontainers/runc/libcontainer/process_linux.go#262
func (p *initProcess) start() error {
    defer p.parentPipe.Close()
    // 启动了 cmd，即启动了 runc init
	err := p.cmd.Start()
	p.process.ops = p
	p.childPipe.Close()
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	}

    // 将 bootstrapData 写入到 parent pipe 中，此时 runc init 可以从 child pipe 里读取到这个数据
	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
    }
    
    // 获取子进程的 PID，即 runc init 的 PID
    childPid, err := p.getChildPid()
	if err != nil {
		return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
	}

	// 如果子容器的配置中要求创建新的 CGROUP Namespace，那么这里还要向 parent pipe 写入一个字节的数据 0x80（createCgroupns）
	if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
		if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
			return newSystemErrorWithCause(err, "sending synchronization value to init process")
		}
	}

	// 等待 runc init 退出
	if err := p.waitForChildExit(childPid); err != nil {
		return newSystemErrorWithCause(err, "waiting for our first child to exit")
	}
    
    // 向 parent pipe 中写入 container config，也就是把容器配置传递给了 runc init
    // 为什么 runc init 都退出了，还要往里面写配置？==》这个问题下面说到 runc init 的时候再解释
	if err := p.sendConfig(); err != nil {
		return newSystemErrorWithCause(err, "sending config to init process")
	}
	var (
		sentRun    bool
		sentResume bool
	)
    // 从 parent pipe 中读取来自 runc init 的同步消息
	ierr := parseSync(p.parentPipe, func(sync *syncT) error {
		...
		return nil
	})
	return nil
}

总结：

runc 被 docker-containerd-shim 调用后，从 config.json 中读取 container spec，并转换成内部 config
这个 runc 在外部运行，拥有 root 权限
runc 启动了一个子进程，runc init，然后通过 pipe 将 bootstrapData（含有 namespace 信息），0x80（NEWCGROUP），容器 config 传输给 runc init，并开始等待 runc init 的同步消息

runc（容器内）启动过程

原则上来说，容器外的 runc 启动的 runc init 仍然是在容器外部的，但是它会逐步的限制自身的 namespace 来构建容器环境，因此这里直接算作容器内的 runc。

runc init 命令启动：

package main

import (
	"os"
	"runtime"

    "github.com/opencontainers/runc/libcontainer"
    // 这个包非常重要，是 runc init 启动的基石
	_ "github.com/opencontainers/runc/libcontainer/nsenter"
	"github.com/urfave/cli"
)

func init() {
	if len(os.Args) > 1 && os.Args[1] == "init" {
		runtime.GOMAXPROCS(1)
		runtime.LockOSThread()
	}
}

var initCommand = cli.Command{
	Name:  "init",
	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
	Action: func(context *cli.Context) error {
        // 构造了一个空的 factory
        factory, _ := libcontainer.New("")
        // 初始化容器环境
		if err := factory.StartInitialization(); err != nil {
			os.Exit(1)
		}
		panic("libcontainer: container init failed to exec")
	},
}

由于 nsenter 包被匿名引入，而且利用了 GCC 构造器特性，导致 go 的代码最后才会执行，因此先看 nsenter 包的代码（github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go）：

// +build linux,!gccgo

package nsenter

/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
	nsexec();
}
*/
import "C"

这个代码利用了 GCC 的 constructor 特性，init 会在 runtimel.main()（不是 main.main()）函数之前执行，这样保证了启动时是单线程的，这一点很重要。因为 linux 不允许在多线程中通过 setns 设置 user namespace。

这个初始化函数调用了 nsexec()（github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c#540）：

void nsexec(void)
{
	int pipenum;
	jmp_buf env;
	int sync_child_pipe[2], sync_grandchild_pipe[2];
	struct nlconfig_t config = { 0 };

	// 从环境变量 _LIBCONTAINER_INITPIPE 中取得 child pipe 的 fd 编号
	pipenum = initpipe();
    if (pipenum == -1)
        // 由于正常启动的 runc 是没有这个环境变量的，所以这里会直接返回，然后就开始正常的执行 go 程序了
		return;

    // 确保当前的二进制文件是已经复制过的，用来规避 CVE-2019-5736 漏洞
    // ensure_cloned_binary 中使用了两种方法：
    // - 使用 memfd，将二进制文件写入 memfd，然后重启 runc
    // - 复制二进制文件到临时文件，然后重启 runc
	if (ensure_cloned_binary() < 0)
		bail("could not ensure we are a cloned binary");

	// 从 child pipe 中读取 namespace config
	nl_parse(pipenum, &config);

	// 设置 oom score，这个只能在特权模式下设置，所以在这里就要修改完成
	update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);

	// 设置不可 dump
	if (config.namespaces) {
		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
			bail("failed to set process as non-dumpable");
	}

	// 创建和子进程通信的 pipe，为什么有这个 pipe，下面解释
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
		bail("failed to setup sync pipe between parent and child");

	// 创建和孙进程通信的 pipe，为什么有这个 pipe，下面解释
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
        bail("failed to setup sync pipe between parent and grandchild");
    
    // setjmp 将当前执行位置的环境保存下来，用于多进程环境下的程序跳转
    // 第一次执行的时候 setjmp 返回 0，对应 JUMP_PARENT
	switch (setjmp(env)) {
	case JUMP_PARENT:{
			int len;
			pid_t child, first_child = -1;
			bool ready = false;

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);

            // clone_parent 创建了和当前进程完全一致的一个进程（子进程）
            // 在 clone_parent 中，通过 longjmp() 跳转到 env 保存的位置
            // 并且 setjmp 返回值为 JUMP_CHILD
            // 这样这个子进程就会根据 switch 执行到 JUMP_CHILD 分支
            // 而当前 runc init 和 子 runc init 之间通过上面创建的
            // sync_child_pipe 进行同步通信
			child = clone_parent(&env, JUMP_CHILD);
			if (child < 0)
				bail("unable to fork: child_func");

            // 通过 sync_child_pipe 循环读取来自子进程的消息
			while (!ready) {
				enum sync_t s;
				int ret;

				syncfd = sync_child_pipe[1];
				close(sync_child_pipe[0]);

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with child: next state");

				switch (s) {
				case SYNC_ERR:
					/* We have to mirror the error code of the child. */
					if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
						bail("failed to sync with child: read(error code)");

					exit(ret);
				case SYNC_USERMAP_PLS:
					// 这里设置 user map，因为子进程修改自身的 user namespace 之后，就没有权限再设置 user map 了

					if (config.is_rootless_euid && !config.is_setgroup)
						update_setgroups(child, SETGROUPS_DENY);

					/* Set up mappings. */
					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);

                    // 向子进程发送 SYNC_USERMAP_ACK，表示处理完成
					s = SYNC_USERMAP_ACK;
					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
						kill(child, SIGKILL);
						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
					}
					break;
				case SYNC_RECVPID_PLS:{
						first_child = child;
                        // 接收孙进程（还是 runc init）的 pid
						/* Get the init_func pid. */
						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
							kill(first_child, SIGKILL);
							bail("failed to sync with child: read(childpid)");
						}

						// 向子进程发送 SYNC_RECVPID_ACK，表示处理完成
						s = SYNC_RECVPID_ACK;
						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
							kill(first_child, SIGKILL);
							kill(child, SIGKILL);
							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
						}

                        // 通过容器外传进来的 child pipe 把子和孙进程 PID，写回去，然后让容器外的 runc 接管 PID
                        // 这个是因为 clone_parent 的时候参数传了 CLONE_PARENT，导致子孙的父进程都是容器外的那个 runc
                        // 所以当前进程无法接管这些 PID
						len = dprintf(pipenum, "{"pid": %d, "pid_first": %d}
", child, first_child);
						if (len < 0) {
							kill(child, SIGKILL);
							bail("unable to generate JSON for child pid");
						}
					}
					break;
                case SYNC_CHILD_READY:
                    // 子进程已经处理完了所有事情，退出循环
					ready = true;
					break;
				default:
					bail("unexpected sync value: %u", s);
				}
			}

            // 通过 sync_grandchild_pipe 循环读取来自孙进程的消息
			ready = false;
			while (!ready) {
				enum sync_t s;
				int ret;

				syncfd = sync_grandchild_pipe[1];
				close(sync_grandchild_pipe[0]);

				s = SYNC_GRANDCHILD;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
					kill(child, SIGKILL);
					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
				}

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with child: next state");

				switch (s) {
				case SYNC_ERR:
					if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
						bail("failed to sync with child: read(error code)");

					exit(ret);
                case SYNC_CHILD_READY:
                    // 等待孙进程准备完成
					ready = true;
					break;
				default:
					bail("unexpected sync value: %u", s);
				}
            }
            // 退出。很明显，当前 runc init 退出的时候，子 runc init 一定也退出了，但是孙 runc init 还没有退出
            // 这也是为什么容器外的 runc 等待子进程退出，却又向 pipe 里写数据的原因，因为孙 runc init 还在等着容器配置
            // 进程正常退出（不给 go 代码执行的机会）
			exit(0);
		}
	case JUMP_CHILD:{
			pid_t child;
			enum sync_t s;

			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = sync_child_pipe[0];
			close(sync_child_pipe[1]);

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);

			// 通过 setns 加入现有的 namespace
			if (config.namespaces)
				join_namespaces(config.namespaces);

            // 如果 clone flag 里有 CLONE_NEWUSER，说明需要创建新的 user namespace，此处调用 unshare 进行了处理
			if (config.cloneflags & CLONE_NEWUSER) {
				if (unshare(CLONE_NEWUSER) < 0)
					bail("failed to unshare user namespace");
				config.cloneflags &= ~CLONE_NEWUSER;

				if (config.namespaces) {
					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
						bail("failed to set process as dumpable");
                }
                
                // 等待父 runc init 配置 user map
				s = SYNC_USERMAP_PLS;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
				if (s != SYNC_USERMAP_ACK)
					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);

				if (config.namespaces) {
					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
						bail("failed to set process as dumpable");
				}

				// 设置当前进程的 uid 为 0，即容器内的 root
				if (setresuid(0, 0, 0) < 0)
					bail("failed to become root in user namespace");
            }
            
			// unshare 其他需要新建的 namespace
			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
				bail("failed to unshare namespaces");

			// 创建孙进程，当前进程已经完成了 namespace 的设置，孙进程会继承这些设置
			child = clone_parent(&env, JUMP_INIT);
			if (child < 0)
				bail("unable to fork: init_func");

			// 将孙进程 PID 传给父 runc init
			s = SYNC_RECVPID_PLS;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
			}
			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(childpid)");
			}

			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
			}
			if (s != SYNC_RECVPID_ACK) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
			}

            // 发送 SYNC_CHILD_READY 给父 runc init
			s = SYNC_CHILD_READY;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
			}

            // 子 runc init 的工作到此结束，进程正常退出（不给 go 代码执行的机会）
			exit(0);
		}

	case JUMP_INIT:{
			// 孙 runc init 是真正启动容器 entrypoint 的进程，并且在启动之前，进行最后的环境准备工作
			enum sync_t s;

			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = sync_grandchild_pipe[0];
			close(sync_grandchild_pipe[1]);
			close(sync_child_pipe[0]);
			close(sync_child_pipe[1]);

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);

			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
			if (s != SYNC_GRANDCHILD)
				bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);

			if (setsid() < 0)
				bail("setsid failed");

			if (setuid(0) < 0)
				bail("setuid failed");

			if (setgid(0) < 0)
				bail("setgid failed");

			if (!config.is_rootless_euid && config.is_setgroup) {
				if (setgroups(0, NULL) < 0)
					bail("setgroups failed");
			}

			// 等待来自容器外 runc 的 child pipe 的关于 cgroup namespace 的消息 0x80（CREATECGROUPNS）
			if (config.cloneflags & CLONE_NEWCGROUP) {
				uint8_t value;
				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
					bail("read synchronisation value failed");
				if (value == CREATECGROUPNS) {
					if (unshare(CLONE_NEWCGROUP) < 0)
						bail("failed to unshare cgroup namespace");
				} else
					bail("received unknown synchronisation value");
			}

            // 发送孙进程准备完成的消息给祖父 runc init
			s = SYNC_CHILD_READY;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with patent: write(SYNC_CHILD_READY)");

			/* Close sync pipes. */
			close(sync_grandchild_pipe[0]);

			/* Free netlink data. */
			nl_free(&config);

            // 此时，父 / 祖父 runc init 都退出了（可能会有时差）
            // 但是当前进程是不能直接退出的，所以这里单纯的 return，然后开始执行 go 代码
			return;
		}
	default:
		bail("unexpected jump value");
	}

	/* Should never be reached. */
	bail("should never be reached");
}

在 namespace 初始化完成后，会通过调用链 LinuxFactory.StartInitialization() -> newContainerInit() 创建容器初始化结构 linuxStandardInit（github.com/opencontainers/runc/libcontainer/init_linux.go#47）：

func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
	var config *initConfig
    // 此处从 child pipe 中读取了 container config
	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
		return nil, err
	}
	if err := populateProcessEnvironment(config.Env); err != nil {
		return nil, err
    }
    // t 为 standard，来自于环境变量 _LIBCONTAINER_INITTYPE
	switch t {
	case initSetns:
		return &linuxSetnsInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			config:        config,
		}, nil
	case initStandard:
		return &linuxStandardInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			parentPid:     unix.Getppid(),
			config:        config,
			fifoFd:        fifoFd,
		}, nil
	}
	return nil, fmt.Errorf("unknown init type %q", t)
}

然后执行 linuxStandardInit.Init()（github.com/opencontainers/runc/libcontainer/standard_init_linux.go#47）：

func (l *linuxStandardInit) Init() error {
    // 这里比较重要的是这个函数，此时各个 Namespace 虽然都挂载完毕了，但是当前的进程的视角里根目录和容器外是一样的
    // 因此这个方法会挂载设备，bind mount，然后将当前根目录切换到容器的根目录下。
	if err := prepareRootfs(l.pipe, l.config); err != nil {
		return err
	}

	// 设置 root (/) 为只读
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	// 在完成一系列容器内的环境准备之后，通过 execve 执行容器内的 entrypoint
	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
		return newSystemErrorWithCause(err, "exec user process")
	}
	return nil
}

总结：

runc init 一个会有三个进程
- 第一个进程读取 bootstrapData，并完成第二个进程的 user map 的设置
- 第二个进程完成 namespace 的设置
- 第三个进程完成 CGROUP namesapce 的设置，并读取了 0x80 的同步信息。最后进入 go 代码。go 代码读取 container config，进行容器内环境准备，最后执行容器的 entrypoint

CVE-2019-5736 过程分析

链接：https://seclists.org/oss-sec/2019/q1/119

通过构造一个恶意的容器，替换掉 runc 执行程序。runc 被再次执行时，恶意代码即可拿到 root 权限。

过程：

在 runc init 的最后一个阶段，runc 会加载容器的 entrypoint
我们伪造一个容器，它具备以下两个要素：
- entrypoint 链接到 /proc/self/exe
- 含有恶意代码的 libc.so（或者其他任意 so，只要会被 runc 加载就行）
当 runc init 最后通过 execve 启动 entrypoint 时，由于 entrypoint 指向了 /proc/self/exe，那么实际上就等于执行了 runc 自身
runc init 被替换，但是容器内的 runc 启动了，由于现在 rootfs 已经是容器的 rootfs 了，所以 so 会从容器内加载，这样就会加载到含有恶意代码的 libc.so
libc.so 的恶意代码在 constructor 里，所以一加载这个 so，这个代码就会执行。恶意代码通过 open 系统调用去只读形式打开 /proc/self/exe（只能以只读形式，因为 runc 在运行），这个时候就会有一个对应的 fd 保留下来
恶意代码这个时候通过 execve 去执行容器内的一个程序，这样不会导致 PID 发生变化，但是程序改变了，并且 fd 继续保留了下来
程序的工作就是找到 fd 编号，就在 /proc/self/fd/ 中，然后再以写的方式重新打开这个 fd（这个时候因为 runc 已经退出了，所以可以以写的方式打开）。然后写入包含恶意代码的 runc。
在下次宿主机上的 runc 再被执行时，这个恶意代码即可执行，并且拥有 runc 的权限，即 root 权限。

createDevices创建device配置

可以在容器中看到/dev

https://blog.csdn.net/zhonglinzhang/article/details/76757277

# ls
core  full    null  pts     shm     stdin   termination-log  urandom
fd    mqueue  ptmx  random  stderr  stdout  tty              zero

func createDevices(spec *specs.Spec, config *configs.Config) error {
    // add whitelisted devices
    config.Devices = []*configs.Device{
        {
            Type:     'c',
            Path:     "/dev/null",
            Major:    1,
            Minor:    3,
            FileMode: 0666,
            Uid:      0,
            Gid:      0,
        },
        {
            Type:     'c',
            Path:     "/dev/random",
            Major:    1,
            Minor:    8,
            FileMode: 0666,
            Uid:      0,
            Gid:      0,
        },
     。。。。。。
    
    }
    // merge in additional devices from the spec
    if spec.Linux != nil {
        。。。。。
            config.Devices = append(config.Devices, device)
        }
    }
    return nil
}

runc

main 函数

isRootless 判读是否不能以root权限启动
环境变量：XDG_RUNTIME_DIR ：针对每个用户的运行时目录。通常针对每个用户单独挂载一个 “tmpfs” 文件系统实例。
os.ModeSticky ，filemode，特殊标记位，文件只允许创建者和root移除
因为默认记录runc运行状态是在/run/runc目录，如果runc不能以root权限启动则不够权限，需要用到XDG_RUNTIME_DIR。

Create

checkArgs 检查参数个数
revisePidFile 修改pid-file的路径为绝对路径
setupSpec 读入配置文件，例如利用 runc spec 自动生成的 config.json
startContainer 启动容器
1. newNotifySocket 需要用到环境变量NOTIFY_SOCKET，一般为空，暂时先不分析。
2. createContainer 创建容器对象需要传入cli上下文，容器id，从配置文件生成的spec
  1. 调用 specconv 包的 CreateLibcontainerConfig ，specconv 的作用是 “Package specconv implements conversion of specifications to libcontainer”
    1. 设置容器根文件系统目录
    2. 设置标签（labels，键值对）
    3. 根据create options（从 cli 中获取或自动生成的）创建 configs.Config 对象
    4. 根据配置文件给Config对象配置mount(挂载点)
    5. createDevices 配置容器环境的白名单设备（例如：/dev/null, /dev/random）和用户指定设备
    6. createCgroupConfig 创建cgroup配置。note： “In rootless containers, any attempt to make cgroup changes will fail.” 依次处理devices，memory，cpu，pid，blockio，hugepagelimit，network。
    7. 配置 namespace，
      - NEWNET，如果newnet 并且没有配置路径，默认加入loopback（本地回环）
      - NEWUSER，配置user id 和 group id 的宿主主机和容器内部的映射。
    8. 处理配置文件的 process 字段。
    9. createHooks 配置钩子，Prestart，Poststart，Poststop