tags: container,docker,源码分析

docker 创建oci运行时配置 源码分析

1. 简介

OCI(Open Container Initiative)是一个开放的治理结构,其围绕容器格式和运行时创建了开放的行业标准。OCI于2015年6月由Docker和容器行业的其他领导者成立,目前包含两个规范:运行时规范(runtime-spec)和镜像规范(image-spec)。

本文分析容器启动时,docker如何创建运行时配置(runtime-spec)。

2. 入口

这一过程发生在docker启动容器过程中,即docker container start时,前部分流程分析可以参见docker container start 源码分析

入口位于Daemon.createSpec方法,创建符合oci标准的容器配置,后续该配置将被传递给oci运行时。

https://github.com/moby/moby/blob/master/daemon/start.go#L153-L156

func (daemon *Daemon) containerStart(container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool) (err error) {
    ...
    spec, err := daemon.createSpec(container)
    if err != nil {
        return errdefs.System(err)
    }
    ...
}

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L1008

func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
    var (
        opts []coci.SpecOpts
        s    = oci.DefaultSpec()
    )
    opts = append(opts,
        WithCommonOptions(daemon, c),
        WithCgroups(daemon, c),
        WithResources(c),
        WithSysctls(c),
        WithDevices(daemon, c),
        WithUser(c),
        WithRlimits(daemon, c),
        WithNamespaces(daemon, c),
        WithCapabilities(c),
        WithSeccomp(daemon, c),
        WithMounts(daemon, c),
        WithLibnetwork(daemon, c),
        WithApparmor(c),
        WithSelinux(c),
        WithOOMScore(&c.HostConfig.OomScoreAdj),
    )
    if c.NoNewPrivileges {
        opts = append(opts, coci.WithNoNewPrivileges)
    }

    // Set the masked and readonly paths with regard to the host config options if they are set.
    if c.HostConfig.MaskedPaths != nil {
        opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
    }
    if c.HostConfig.ReadonlyPaths != nil {
        opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
    }
    if daemon.configStore.Rootless {
        opts = append(opts, WithRootless(daemon))
    }
    return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
        ID: c.ID,
    }, &s, opts...)
}

3. 默认Linux容器oci配置

在应用其他配置前,linux容器有一些默认配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L1014

func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
    var (
        ...
        s    = oci.DefaultSpec()
    )
    ...
}

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L16

func DefaultSpec() specs.Spec {
    return DefaultOSSpec(runtime.GOOS)
}

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L25

func DefaultOSSpec(osName string) specs.Spec {
    ...
    return DefaultLinuxSpec()
}

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L39

func DefaultLinuxSpec() specs.Spec {
    ...
}

3.1 capability

容器内限制了capability, 默认仅支持以下能力集。

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L42-L48

func DefaultLinuxSpec() specs.Spec {
    s := specs.Spec{
        Version: specs.Version,
        Process: &specs.Process{
            Capabilities: &specs.LinuxCapabilities{
                Bounding:  caps.DefaultCapabilities(),
                Permitted: caps.DefaultCapabilities(),
                Effective: caps.DefaultCapabilities(),
            },
        },
        Root: &specs.Root{},
    }
    ...
}

https://github.com/moby/moby/blob/v20.10.14/oci/caps/defaults.go#L4

func DefaultCapabilities() []string {
    return []string{
        "CAP_CHOWN",
        "CAP_DAC_OVERRIDE",
        "CAP_FSETID",
        "CAP_FOWNER",
        "CAP_MKNOD",
        "CAP_NET_RAW",
        "CAP_SETGID",
        "CAP_SETUID",
        "CAP_SETFCAP",
        "CAP_SETPCAP",
        "CAP_NET_BIND_SERVICE",
        "CAP_SYS_CHROOT",
        "CAP_KILL",
        "CAP_AUDIT_WRITE",
    }
}

3.2 挂载

默认需要挂载的文件系统:

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L51-L94

func DefaultLinuxSpec() specs.Spec {
    ...
    s.Mounts = []specs.Mount{
        {
            Destination: "/proc",
            Type:        "proc",
            Source:      "proc",
            Options:     []string{"nosuid", "noexec", "nodev"},
        },
        {
            Destination: "/dev",
            Type:        "tmpfs",
            Source:      "tmpfs",
            Options:     []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
        },
        {
            Destination: "/dev/pts",
            Type:        "devpts",
            Source:      "devpts",
            Options:     []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
        },
        {
            Destination: "/sys",
            Type:        "sysfs",
            Source:      "sysfs",
            Options:     []string{"nosuid", "noexec", "nodev", "ro"},
        },
        {
            Destination: "/sys/fs/cgroup",
            Type:        "cgroup",
            Source:      "cgroup",
            Options:     []string{"ro", "nosuid", "noexec", "nodev"},
        },
        {
            Destination: "/dev/mqueue",
            Type:        "mqueue",
            Source:      "mqueue",
            Options:     []string{"nosuid", "noexec", "nodev"},
        },
        {
            Destination: "/dev/shm",
            Type:        "tmpfs",
            Source:      "shm",
            Options:     []string{"nosuid", "noexec", "nodev", "mode=1777"},
        },
    }
    ...
}

但有一些文件可能会泄漏宿主机敏感信息,所以指定到MaskedPaths成员变量,后续将会把/dev/null或tmpfs挂载至这些路径,这样原先有敏感信息的路径将被覆盖。

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L97-L108

func DefaultLinuxSpec() specs.Spec {
    ...
    s.Linux = &specs.Linux{
        MaskedPaths: []string{
            "/proc/asound",
            "/proc/acpi",
            "/proc/kcore",
            "/proc/keys",
            "/proc/latency_stats",
            "/proc/timer_list",
            "/proc/timer_stats",
            "/proc/sched_debug",
            "/proc/scsi",
            "/sys/firmware",
        },
        ...
    }
    ...
}

另有一些路径不应允许被容器内用户写入,否则可能存在安全风险。这些路径被写在ReadonlyPaths成员变量, 后续以只读形式重新挂载。

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L109-L115

func DefaultLinuxSpec() specs.Spec {
    ...
    s.Linux = &specs.Linux{
        ...
        ReadonlyPaths: []string{
            "/proc/bus",
            "/proc/fs",
            "/proc/irq",
            "/proc/sys",
            "/proc/sysrq-trigger",
        },
        ...
    }
    ...
}

3.3 namespace

默认创建以下namespace, 后续根据容器创建时指定的配置,还可能会增加其他namespace。

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L116-L122

func DefaultLinuxSpec() specs.Spec {
    ...
    s.Linux = &specs.Linux{
        ...
        Namespaces: []specs.LinuxNamespace{
            {Type: "mount"},
            {Type: "network"},
            {Type: "uts"},
            {Type: "pid"},
            {Type: "ipc"},
        },
        ...
    }
    ...
}

3.4 设备

docker默认没有添加设备。

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L127

func DefaultLinuxSpec() specs.Spec {
    ...
    s.Linux = &specs.Linux{
        ...
        // Devices implicitly contains the following devices:
        // null, zero, full, random, urandom, tty, console, and ptmx.
        // ptmx is a bind mount or symlink of the container's ptmx.
        // See also: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#default-devices
        Devices: []specs.LinuxDevice{},
        ...
    }
    ...
}

但即使oci配置中没有指定,runc也会默认创建以下设备:null, zero, full, random, urandom, tty。

https://github.com/opencontainers/runc/blob/v1.1.2/libcontainer/specconv/spec_linux.go#L191

var AllowedDevices = []*devices.Device{
    ...
    {
        Path:     "/dev/null",
        FileMode: 0o666,
        Uid:      0,
        Gid:      0,
        Rule: devices.Rule{
            Type:        devices.CharDevice,
            Major:       1,
            Minor:       3,
            Permissions: "rwm",
            Allow:       true,
        },
    },
    {
        Path:     "/dev/random",
        FileMode: 0o666,
        Uid:      0,
        Gid:      0,
        Rule: devices.Rule{
            Type:        devices.CharDevice,
            Major:       1,
            Minor:       8,
            Permissions: "rwm",
            Allow:       true,
        },
    },
    {
        Path:     "/dev/full",
        FileMode: 0o666,
        Uid:      0,
        Gid:      0,
        Rule: devices.Rule{
            Type:        devices.CharDevice,
            Major:       1,
            Minor:       7,
            Permissions: "rwm",
            Allow:       true,
        },
    },
    {
        Path:     "/dev/tty",
        FileMode: 0o666,
        Uid:      0,
        Gid:      0,
        Rule: devices.Rule{
            Type:        devices.CharDevice,
            Major:       5,
            Minor:       0,
            Permissions: "rwm",
            Allow:       true,
        },
    },
    {
        Path:     "/dev/zero",
        FileMode: 0o666,
        Uid:      0,
        Gid:      0,
        Rule: devices.Rule{
            Type:        devices.CharDevice,
            Major:       1,
            Minor:       5,
            Permissions: "rwm",
            Allow:       true,
        },
    },
    {
        Path:     "/dev/urandom",
        FileMode: 0o666,
        Uid:      0,
        Gid:      0,
        Rule: devices.Rule{
            Type:        devices.CharDevice,
            Major:       1,
            Minor:       9,
            Permissions: "rwm",
            Allow:       true,
        },
    },
    ...
}

docker通过设备号,配置了各设备的cgroup访问控制策略。

Linux中设备与设备号的约定参见 Linux Kernel文档

https://github.com/moby/moby/blob/v20.10.14/oci/defaults.go#L129-L183

func DefaultLinuxSpec() specs.Spec {
    ...
        Resources: &specs.LinuxResources{
            Devices: []specs.LinuxDeviceCgroup{
                {
                    Allow:  false,
                    Access: "rwm",
                },
                {
                    Allow:  true,
                    Type:   "c",
                    Major:  iPtr(1),
                    Minor:  iPtr(5),
                    Access: "rwm",
                },
                {
                    Allow:  true,
                    Type:   "c",
                    Major:  iPtr(1),
                    Minor:  iPtr(3),
                    Access: "rwm",
                },
                {
                    Allow:  true,
                    Type:   "c",
                    Major:  iPtr(1),
                    Minor:  iPtr(9),
                    Access: "rwm",
                },
                {
                    Allow:  true,
                    Type:   "c",
                    Major:  iPtr(1),
                    Minor:  iPtr(8),
                    Access: "rwm",
                },
                {
                    Allow:  true,
                    Type:   "c",
                    Major:  iPtr(5),
                    Minor:  iPtr(0),
                    Access: "rwm",
                },
                {
                    Allow:  true,
                    Type:   "c",
                    Major:  iPtr(5),
                    Minor:  iPtr(1),
                    Access: "rwm",
                },
                {
                    Allow:  false,
                    Type:   "c",
                    Major:  iPtr(10),
                    Minor:  iPtr(229),
                    Access: "rwm",
                },
            },
        },
    }
    ...
}

4. 其他各类型配置分析

4.1 常规配置选项: WithCommonOptions

4.1.1 创建容器进程环境变量

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L757

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        linkedEnv, err := daemon.setupLinkedContainers(c)
        if err != nil {
            return err
        }
        ...
        s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
        ...
    }
}

https://github.com/moby/moby/blob/v20.10.14/container/container.go#L731

func (container *Container) CreateDaemonEnvironment(tty bool, linkedEnv []string) []string {
    ...
    env = append(env, "PATH="+system.DefaultPathEnv(os))
    env = append(env, "HOSTNAME="+container.Config.Hostname)
    if tty {
        env = append(env, "TERM=xterm")
    }
    env = append(env, linkedEnv...)
    ...
    env = ReplaceOrAppendEnvValues(env, container.Config.Env)
    return env
}

4.1.2 rootfs

设置oci配置的Root路径,形如/var/lib/docker/overlay2/{LayerID}/merged

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L721

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        s.Root = &specs.Root{
            Path:     c.BaseFS.Path(),
            Readonly: c.HostConfig.ReadonlyRootfs,
        }
        ...
    }
}

4.1.3 容器进程

创建并设置容器进程的工作目录。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L725-L731

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
            return err
        }
        cwd := c.Config.WorkingDir
        if len(cwd) == 0 {
            cwd = "/"
        }
        s.Process.Cwd = cwd
        ...
    }
}

https://github.com/moby/moby/blob/v20.10.14/container/container.go#L265

func (container *Container) SetupWorkingDirectory(rootIdentity idtools.Identity) error {
    ...
    container.Config.WorkingDir = filepath.Clean(container.Config.WorkingDir)
    pth, err := container.GetResourcePath(container.Config.WorkingDir)
    if err != nil {
        return err
    }

    if err := idtools.MkdirAllAndChownNew(pth, 0755, rootIdentity); err != nil {
        pthInfo, err2 := os.Stat(pth)
        if err2 == nil && pthInfo != nil && !pthInfo.IsDir() {
            return errors.Errorf("Cannot mkdir: %s is not a directory", container.Config.WorkingDir)
        }

        return err
    }

    return nil
}

docker可以通过init参数,设置容器的第一个进程。如果使用了该参数,且容器使用了独立的pid namespace,则将该init命令文件bind mount到容器内,并设置容器进程的参数。

启用init时,默认使用的init程序为docker-init,此时容器进程的参数为/sbin/docker-init -- {ENTRYPOINT}。docker-init将以只读方式bind mount至容器的/sbin/docker-init路径。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L732-L755

const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        s.Process.Args = append([]string{c.Path}, c.Args...)
        ...
        if c.HostConfig.PidMode.IsPrivate() {
            if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
                (c.HostConfig.Init == nil && daemon.configStore.Init) {
                s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
                path := daemon.configStore.InitPath
                if path == "" {
                    path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
                    if err != nil {
                        return err
                    }
                }
                s.Mounts = append(s.Mounts, specs.Mount{
                    Destination: inContainerInitPath,
                    Type:        "bind",
                    Source:      path,
                    Options:     []string{"bind", "ro"},
                })
            }
        }
        ...
    }
}

容器创建时可以指定--tty参数,该参数被传递给s.Process.Terminal

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L758

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        s.Process.Terminal = c.Config.Tty
        ...
    }
}

4.1.4 hostname, domainname

直接传递hostname到oci配置:

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L760

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        s.Hostname = c.Config.Hostname
        ...
    }
}

设置Domainname,因为oci中没有该配置项,所以通过sysctl传递

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L761

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        setLinuxDomainname(c, s)
        ...
    }
}

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_utils.go#L8

func setLinuxDomainname(c *container.Container, s *specs.Spec) {
	// There isn't a field in the OCI for the NIS domainname, but luckily there
	// is a sysctl which has an identical effect to setdomainname(2) so there's
	// no explicit need for runtime support.
	s.Linux.Sysctl = make(map[string]string)
	if c.Config.Domainname != "" {
		s.Linux.Sysctl["kernel.domainname"] = c.Config.Domainname
	}
}

4.1.5 非特权ICMP echo和端口监听

通过sysctl设置ping范围、允许普通用户监听特权端口。这两项功能原本分别需要CAP_NET_RAW、CAP_NET_BIND_SERVICE才能使用,引入此段代码后,如果容器使用独立的network namespace, 则无需这些capability即可在docker中默认生效。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L768-L779

func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        if c.HostConfig.NetworkMode.IsPrivate() {
            // We cannot set up ping socket support in a user namespace
            userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
            if !userNS && !sys.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
                // allow unprivileged ICMP echo sockets without CAP_NET_RAW
                s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
            }
            // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
            if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
                s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
            }
        }

        return nil
    }
}

4.2 cgroups配置: WithCgroups

传递cgroup路径到oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L786

func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        var cgroupsPath string
        scopePrefix := "docker"
        parent := "/docker"
        useSystemd := UsingSystemd(daemon.configStore)
        if useSystemd {
            parent = "system.slice"
            if daemon.configStore.Rootless {
                parent = "user.slice"
            }
        }

        if c.HostConfig.CgroupParent != "" {
            parent = c.HostConfig.CgroupParent
        } else if daemon.configStore.CgroupParent != "" {
            parent = daemon.configStore.CgroupParent
        }

        if useSystemd {
            cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
            logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
        } else {
            cgroupsPath = filepath.Join(parent, c.ID)
        }
        s.Linux.CgroupsPath = cgroupsPath

        // the rest is only needed for CPU RT controller

        if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
            return nil
        }
        ...
        if err := daemon.initCPURtController(mnt, parentPath); err != nil {
            return errors.Wrap(err, "unable to init CPU RT controller")
        }
        return nil
    }
}

4.3 资源限制: WithResources

从HostConfig读取资源限制规则,传递给oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L936

func WithResources(c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        r := c.HostConfig.Resources
        weightDevices, err := getBlkioWeightDevices(r)
        if err != nil {
            return err
        }
        readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
        if err != nil {
            return err
        }
        writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
        if err != nil {
            return err
        }
        readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
        if err != nil {
            return err
        }
        writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
        if err != nil {
            return err
        }

        memoryRes := getMemoryResources(r)
        cpuRes, err := getCPUResources(r)
        if err != nil {
            return err
        }
        blkioWeight := r.BlkioWeight

        specResources := &specs.LinuxResources{
            Memory: memoryRes,
            CPU:    cpuRes,
            BlockIO: &specs.LinuxBlockIO{
                Weight:                  &blkioWeight,
                WeightDevice:            weightDevices,
                ThrottleReadBpsDevice:   readBpsDevice,
                ThrottleWriteBpsDevice:  writeBpsDevice,
                ThrottleReadIOPSDevice:  readIOpsDevice,
                ThrottleWriteIOPSDevice: writeIOpsDevice,
            },
            Pids: getPidsLimit(r),
        }

        if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
            specResources.Devices = s.Linux.Resources.Devices
        }

        s.Linux.Resources = specResources
        return nil
    }
}

4.4 WithSysctls

前面已经设置了一些sysctl配置,WithSysctls函数将HostConfig中用户额外指定的sysctl配置传递给oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L991

func WithSysctls(c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        for k, v := range c.HostConfig.Sysctls {
            s.Linux.Sysctl[k] = v
        }
        return nil
    }
}

4.5 设备:WithDevices

用于处理用户创建容器时指定的设备相关参数,具体包括以下参数:

  • --device: 添加宿主机设备到容器
  • --device-cgroup-rule: 添加设备的cgroup规则
  • --gpus: 向容器添加gpu设备,对应HostConfig.DeviceRequests

如果是特权容器,且未运行在userns下,添加所有宿主机设备,且允许对所有设备的所有权限。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L873-L906

func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        // Build lists of devices allowed and created within the container.
        var devs []specs.LinuxDevice
        devPermissions := s.Linux.Resources.Devices

        if c.HostConfig.Privileged && !sys.RunningInUserNS() {
            hostDevices, err := devices.HostDevices()
            if err != nil {
                return err
            }
            for _, d := range hostDevices {
                devs = append(devs, oci.Device(d))
            }

            // adding device mappings in privileged containers
            for _, deviceMapping := range c.HostConfig.Devices {
                ...
                // issue a warning that the device path already exists via /dev mounting in privileged mode
                if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
                    logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
                    continue
                }
                d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
                if err != nil {
                    return err
                }
                devs = append(devs, d...)
            }

            devPermissions = []specs.LinuxDeviceCgroup{
                {
                    Allow:  true,
                    Access: "rwm",
                },
            }
        }
        ...
    }
}

非特权容器或userns模式,需要设置设备和cgroup权限。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L906-L921

func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        if c.HostConfig.Privileged && !sys.RunningInUserNS() {
            ...
        } else {
            for _, deviceMapping := range c.HostConfig.Devices {
                d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
                if err != nil {
                    return err
                }
                devs = append(devs, d...)
                devPermissions = append(devPermissions, dPermissions...)
            }

            var err error
            devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
            if err != nil {
                return err
            }
        }
        ...
    }
}

将设置好的设备及对应的cgroup规则传递至oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L923-L924

func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        s.Linux.Devices = append(s.Linux.Devices, devs...)
        s.Linux.Resources.Devices = devPermissions
        ...
    }
}

处理特殊设备。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L926-L931

func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        for _, req := range c.HostConfig.DeviceRequests {
            if err := daemon.handleDevice(req, s); err != nil {
                return err
            }
        }
        return nil
    }
}

所谓“特殊设备”, 当前实际仅支持nvidia gpu。

https://github.com/docker/cli/blob/master/cli/command/container/opts.go#L593

func parse(flags *pflag.FlagSet, copts *containerOptions, serverOS string) (*containerConfig, error) {
    ...
    resources := container.Resources{
        ...
        DeviceRequests:       copts.gpus.Value(),
    }
    ...
    hostConfig := &container.HostConfig{
        ...
        Resources:      resources,
        ...
    }
    ...
}

当前仅有nvidia注册成为了DeviceDriver, 所以说仅支持nvidia gpu。

https://github.com/moby/moby/blob/v20.10.14/daemon/nvidia_linux.go#L32

func init() {
    if _, err := exec.LookPath(nvidiaHook); err != nil {
        // do not register Nvidia driver if helper binary is not present.
        return
    }
    capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}}
    nvidiaDriver := &deviceDriver{
        capset:     capset,
        updateSpec: setNvidiaGPUs,
    }
    for c := range allNvidiaCaps {
        nvidiaDriver.capset[string(c)] = struct{}{}
    }
    registerDeviceDriver("nvidia", nvidiaDriver)
}

https://github.com/moby/moby/blob/v20.10.14/daemon/devices_linux.go#L22

var deviceDrivers = map[string]*deviceDriver{}

type deviceDriver struct {
    capset     capabilities.Set
    updateSpec func(*specs.Spec, *deviceInstance) error
}

...

func registerDeviceDriver(name string, d *deviceDriver) {
    deviceDrivers[name] = d
}

处理特殊设备即调用设备驱动更新oci配置。这是一种后向兼容设计,目前如果用户指定了gpus参数,则nvidia driver的updateSpec方法会被调用。

https://github.com/moby/moby/blob/v20.10.14/daemon/devices_linux.go#L34

func (daemon *Daemon) handleDevice(req container.DeviceRequest, spec *specs.Spec) error {
    if req.Driver == "" {
        for _, dd := range deviceDrivers {
            if selected := dd.capset.Match(req.Capabilities); selected != nil {
                return dd.updateSpec(spec, &deviceInstance{req: req, selectedCaps: selected})
            }
        }
    } else if dd := deviceDrivers[req.Driver]; dd != nil {
        if selected := dd.capset.Match(req.Capabilities); selected != nil {
            return dd.updateSpec(spec, &deviceInstance{req: req, selectedCaps: selected})
        }
    }
    return incompatibleDeviceRequest{req.Driver, req.Capabilities}
}

配置进程环境变量,并在runtime的Prestart阶段,将nvidia-container-runtime-hook prestart命令设置为Hook进程

https://github.com/moby/moby/blob/v20.10.14/daemon/nvidia_linux.go#L48

func setNvidiaGPUs(s *specs.Spec, dev *deviceInstance) error {
    req := dev.req
    if req.Count != 0 && len(req.DeviceIDs) > 0 {
        return errConflictCountDeviceIDs
    }

    if len(req.DeviceIDs) > 0 {
        s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+strings.Join(req.DeviceIDs, ","))
    } else if req.Count > 0 {
        s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+countToDevices(req.Count))
    } else if req.Count < 0 {
        s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES=all")
    }

    var nvidiaCaps []string
    // req.Capabilities contains device capabilities, some but not all are NVIDIA driver capabilities.
    for _, c := range dev.selectedCaps {
        nvcap := nvidia.Capability(c)
        if _, isNvidiaCap := allNvidiaCaps[nvcap]; isNvidiaCap {
            nvidiaCaps = append(nvidiaCaps, c)
            continue
        }
        // TODO: nvidia.WithRequiredCUDAVersion
        // for now we let the prestart hook verify cuda versions but errors are not pretty.
    }

    if nvidiaCaps != nil {
        s.Process.Env = append(s.Process.Env, "NVIDIA_DRIVER_CAPABILITIES="+strings.Join(nvidiaCaps, ","))
    }

    path, err := exec.LookPath(nvidiaHook)
    if err != nil {
        return err
    }

    if s.Hooks == nil {
        s.Hooks = &specs.Hooks{}
    }
    s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
        Path: path,
        Args: []string{
            nvidiaHook,
            "prestart",
        },
        Env: os.Environ(),
    })

    return nil
}

4.6 进程运行用户: WithUser

根据容器配置中指定的运行用户(Config.User),从容器的/etc/passwd,/etc/group文件中读取对应的信息,传递至oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L1003

func WithUser(c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        var err error
        s.Process.User, err = getUser(c, c.Config.User)
        return err
    }
}

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L182

func getUser(c *container.Container, username string) (specs.User, error) {
    var usr specs.User
    passwdPath, err := resourcePath(c, user.GetPasswdPath)
    if err != nil {
        return usr, err
    }
    groupPath, err := resourcePath(c, user.GetGroupPath)
    if err != nil {
        return usr, err
    }
    execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
    if err != nil {
        return usr, err
    }
    usr.UID = uint32(execUser.Uid)
    usr.GID = uint32(execUser.Gid)

    var addGroups []int
    if len(c.HostConfig.GroupAdd) > 0 {
        addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
        if err != nil {
            return usr, err
        }
    }
    for _, g := range append(execUser.Sgids, addGroups...) {
        usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
    }
    return usr, nil
}

4.7 WithRlimits

合并容器和dockerd的rlimits配置,传递至oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L42

// WithRlimits returns a SpecOpts that sets the process rlimits of the spec.
// The container's ulimits are merged with the daemon defaults (via
// daemon.mergeUlimits) and each resulting ulimit is converted to a
// POSIXRlimit whose type is "RLIMIT_" plus the upper-cased ulimit name.
func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        // Merge into a copy so the container's own HostConfig struct is
        // not the one handed to mergeUlimits.
        merged := *c.HostConfig
        daemon.mergeUlimits(&merged)

        var limits []specs.POSIXRlimit
        for _, u := range merged.Ulimits {
            limit := specs.POSIXRlimit{
                Type: "RLIMIT_" + strings.ToUpper(u.Name),
                Soft: uint64(u.Soft),
                Hard: uint64(u.Hard),
            }
            limits = append(limits, limit)
        }

        s.Process.Rlimits = limits
        return nil
    }
}

4.8 WithNamespaces

DefaultLinuxSpec()函数中,已经设置了默认的namespace。在WithNamespaces()函数中将根据用户配置进行调整。


设置oci配置中user namespace配置:

https://github.com/moby/moby/blob/master/daemon/oci_linux.go#L230-L239

// WithNamespaces returns a SpecOpts that adjusts the namespaces of the spec.
//
// Excerpt: user namespace handling. When the container's UsernsMode is
// private and the daemon has a UID mapping, a "user" namespace is set on the
// spec and the UID/GID mappings are taken from daemon.idMapping. userNS is
// set to true in that case.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        userNS := false
        // user
        if c.HostConfig.UsernsMode.IsPrivate() {
            uidMap := daemon.idMapping.UIDs()
            // A nil UID map leaves the spec's user namespace untouched.
            if uidMap != nil {
                userNS = true
                ns := specs.LinuxNamespace{Type: "user"}
                setNamespace(s, ns)
                s.Linux.UIDMappings = specMapping(uidMap)
                s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
            }
        }
        ...
    }
}

  • 如果使用其他容器的network namespace, 将network namespace的路径设置为该容器进程的net ns路径(/proc/{PID}/ns/net)
  • 如果使用宿主机的network namespace, 将network namespace的路径设置为/var/run/docker/netns/default

https://github.com/moby/moby/blob/master/daemon/oci_linux.go#L241-L260

// WithNamespaces returns a SpecOpts that adjusts the namespaces of the spec.
//
// Excerpt: network namespace handling. Unless networking is disabled, a
// "network" namespace entry is set on the spec:
//   - NetworkMode "container:<ref>": Path is /proc/<pid>/ns/net of the
//     referenced container; when userNS is true the user namespace of that
//     container is joined as well.
//   - host mode: Path is c.NetworkSettings.SandboxKey.
//   - otherwise: Path is left empty.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        // network
        if !c.Config.NetworkDisabled {
            ns := specs.LinuxNamespace{Type: "network"}
            // parts[0] is the mode name before the first ":".
            parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
            if parts[0] == "container" {
                nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
                if err != nil {
                    return err
                }
                ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
                if userNS {
                    // to share a net namespace, they must also share a user namespace
                    nsUser := specs.LinuxNamespace{Type: "user"}
                    nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
                    setNamespace(s, nsUser)
                }
            } else if c.HostConfig.NetworkMode.IsHost() {
                ns.Path = c.NetworkSettings.SandboxKey
            }
            setNamespace(s, ns)
        }
        ...
    }
}

  • 如果使用其他容器的ipc namespace, 将ipc namespace的路径设置为该容器进程的ipc ns路径
  • 如果使用宿主机的ipc namespace,则从oci配置中移除ipc namespace
  • 如果使用独立的ipc namespace,则将ipc namespace的Path变量置空

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L260-L287

// WithNamespaces returns a SpecOpts that adjusts the namespaces of the spec.
//
// Excerpt: IPC namespace handling, selected by c.HostConfig.IpcMode:
//   - container mode: the "ipc" namespace Path is /proc/<pid>/ns/ipc of the
//     referenced container; when userNS is true that container's user
//     namespace is joined as well.
//   - host mode: the "ipc" namespace is removed from the spec.
//   - empty, private, shareable or none: an "ipc" namespace without a Path
//     is set.
//   - anything else: an "Invalid IPC mode" error is returned.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        // ipc
        ipcMode := c.HostConfig.IpcMode
        switch {
        case ipcMode.IsContainer():
            ns := specs.LinuxNamespace{Type: "ipc"}
            ic, err := daemon.getIpcContainer(ipcMode.Container())
            if err != nil {
                return err
            }
            ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
            setNamespace(s, ns)
            if userNS {
                // to share an IPC namespace, they must also share a user namespace
                nsUser := specs.LinuxNamespace{Type: "user"}
                nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
                setNamespace(s, nsUser)
            }
        case ipcMode.IsHost():
            oci.RemoveNamespace(s, "ipc")
        case ipcMode.IsEmpty():
            // A container was created by an older version of the daemon.
            // The default behavior used to be what is now called "shareable".
            fallthrough
        case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
            ns := specs.LinuxNamespace{Type: "ipc"}
            setNamespace(s, ns)
        default:
            return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
        }
    }
}

  • 如果使用其他容器的pid namespace, 将pid namespace的路径设置为该容器进程pid ns路径
  • 如果使用宿主机的pid namespace, 则从oci配置中移除pid namespace
  • 如果使用独立的pid namespace,则将pid namespace的Path变量置空

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L290-L313

// WithNamespaces returns a SpecOpts that adjusts the namespaces of the spec.
//
// Excerpt: PID namespace handling, selected by c.HostConfig.PidMode:
//   - container mode: the "pid" namespace Path is /proc/<pid>/ns/pid of the
//     container returned by daemon.getPidContainer; when userNS is true that
//     container's user namespace is joined as well.
//   - host mode: the "pid" namespace is removed from the spec.
//   - otherwise: a "pid" namespace without a Path is set.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...        
        // pid
        if c.HostConfig.PidMode.IsContainer() {
            pc, err := daemon.getPidContainer(c)
            if err != nil {
                return err
            }
            ns := specs.LinuxNamespace{
                Type: "pid",
                Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
            }
            setNamespace(s, ns)
            if userNS {
                // to share a PID namespace, they must also share a user namespace
                nsUser := specs.LinuxNamespace{
                    Type: "user",
                    Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
                }
                setNamespace(s, nsUser)
            }
        } else if c.HostConfig.PidMode.IsHost() {
            oci.RemoveNamespace(s, "pid")
        } else {
            ns := specs.LinuxNamespace{Type: "pid"}
            setNamespace(s, ns)
        }
        ...
    }
}

如果使用宿主机的UTS namespace,则从之前DefaultLinuxSpec()函数配置的namespace中移除uts namespace。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L315-L318

// WithNamespaces returns a SpecOpts that adjusts the namespaces of the spec.
//
// Excerpt: UTS namespace handling. In host UTS mode the "uts" namespace is
// removed from the spec and the spec's Hostname is cleared.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        // uts
        if c.HostConfig.UTSMode.IsHost() {
            oci.RemoveNamespace(s, "uts")
            s.Hostname = ""
        }
        ...
    }
}

设置cgroup namespace。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L321-L330

// WithNamespaces returns a SpecOpts that adjusts the namespaces of the spec.
//
// Excerpt: cgroup namespace handling. When CgroupnsMode is non-empty and
// private, a "cgroup" namespace is set on the spec; in the code shown here
// no cgroup namespace entry is set for any other mode.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        // cgroup
        if !c.HostConfig.CgroupnsMode.IsEmpty() {
            cgroupNsMode := c.HostConfig.CgroupnsMode
            ...
            if cgroupNsMode.IsPrivate() {
                nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
                setNamespace(s, nsCgroup)
            }
        }

        return nil
    }
}

对于API版本低于1.41的客户端,在使用cgroup v1的宿主机上,默认情况使用宿主机的cgroup namespace:

https://github.com/moby/moby/blob/v20.10.14/api/server/router/container/container_routes.go#L497-L502

// postContainersCreate handles the container create API request.
//
// Excerpt: for API versions below 1.41 on hosts where s.cgroup2 is false, an
// empty CgroupnsMode in the supplied host config is defaulted to "host".
func (s *containerRouter) postContainersCreate(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
    ...
    if hostConfig != nil && versions.LessThan(version, "1.41") && !s.cgroup2 {
        // Older clients expect the default to be "host" on cgroup v1 hosts
        if hostConfig.CgroupnsMode.IsEmpty() {
            hostConfig.CgroupnsMode = container.CgroupnsMode("host")
        }
    }
    ...
}

cgroup v2默认使用独立的cgroup namespace。

// adaptContainerSettings adjusts hostConfig before it is used.
//
// Excerpt: defaulting of an empty CgroupnsMode.
//   - Privileged containers on a non-unified (cgroup v1) host get "host".
//   - Otherwise the mode starts as "host", becomes "private" on a unified
//     (cgroup v2) host, and is then replaced by the daemon's configured
//     CgroupNamespaceMode whenever daemon.configStore is non-nil.
func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConfig, adjustCPUShares bool) error {
    ...
    if hostConfig.CgroupnsMode.IsEmpty() {
        // for cgroup v2: unshare cgroupns even for privileged containers
        // https://github.com/containers/libpod/pull/4374#issuecomment-549776387
        if hostConfig.Privileged && cgroups.Mode() != cgroups.Unified {
            hostConfig.CgroupnsMode = containertypes.CgroupnsMode("host")
        } else {
            m := "host"
            if cgroups.Mode() == cgroups.Unified {
                m = "private"
            }
            // NOTE(review): this assignment is unconditional on the value
            // computed above; presumably CgroupNamespaceMode is always
            // populated in the daemon config - confirm against the config code.
            if daemon.configStore != nil {
                m = daemon.configStore.CgroupNamespaceMode
            }
            hostConfig.CgroupnsMode = containertypes.CgroupnsMode(m)
        }
    }
    ...
}


4.9 WithCapabilities

整合用户的设置和默认capabilities, 更新oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L159

// WithCapabilities returns a SpecOpts that sets the process capabilities of
// the spec. The capability list is computed by caps.TweakCapabilities from
// the default capabilities and the container's CapAdd, CapDrop and
// Privileged settings, then applied with oci.SetCapabilities.
func WithCapabilities(c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        hc := c.HostConfig
        capList, err := caps.TweakCapabilities(caps.DefaultCapabilities(), hc.CapAdd, hc.CapDrop, hc.Privileged)
        if err != nil {
            return err
        }
        return oci.SetCapabilities(s, capList)
    }
}

如果容器进程的用户UID不是0(即非root用户), 仅设置Bounding,其他属性置空。

https://github.com/moby/moby/blob/v20.10.14/oci/oci.go#L21

// SetCapabilities replaces the process capabilities of s with sets built
// from caplist. The Bounding set is always caplist. Effective and Permitted
// are set to caplist only when the process UID is 0; for any other UID they
// are left unset. It always returns nil.
func SetCapabilities(s *specs.Spec, caplist []string) error {
    // setUser has already been executed here
    granted := &specs.LinuxCapabilities{Bounding: caplist}
    if s.Process.User.UID == 0 {
        granted.Effective = caplist
        granted.Permitted = caplist
    }
    // Do not set Effective and Permitted capabilities for non-root users,
    // to match what execve does.
    s.Process.Capabilities = granted
    return nil
}

4.10 将docker seccomp规则转换成oci格式:WithSeccomp

docker将seccomp规则与capability进行了结合,大大方便了seccomp的使用,在WithSeccomp函数中,将其转换成为了oci配置的格式。


用户将seccomp配置为unconfined,或容器为特权容器,或内核不支持seccomp时,不设置oci的seccomp配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/seccomp_linux.go#L21-L34

// WithSeccomp returns a SpecOpts that sets the seccomp configuration of the
// spec.
//
// Excerpt: cases in which no seccomp configuration is set. The function
// returns early with nil when the container's profile is "unconfined" or the
// container is privileged. When daemon.seccompEnabled is false, a custom
// profile is rejected with an error; without a custom profile a warning is
// logged, c.SeccompProfile is set to "unconfined" and nil is returned.
func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        if c.SeccompProfile == "unconfined" {
            return nil
        }
        if c.HostConfig.Privileged {
            return nil
        }
        if !daemon.seccompEnabled {
            if c.SeccompProfile != "" {
                return fmt.Errorf("seccomp is not enabled in your kernel, cannot run a custom seccomp profile")
            }
            logrus.Warn("seccomp is not enabled in your kernel, running container without default profile")
            // Note: this mutates the container, not only the spec.
            c.SeccompProfile = "unconfined"
            return nil
        }
        ...
    }
}

依次以用户指定配置、daemon配置、默认规则顺序判断加载何种seccomp规则。

https://github.com/moby/moby/blob/v20.10.14/daemon/seccomp_linux.go#L35-L44

// WithSeccomp returns a SpecOpts that sets the seccomp configuration of the
// spec.
//
// Excerpt: profile selection. The first available source wins: the
// container's own SeccompProfile, then the daemon's seccompProfile, then the
// default profile. The result is stored in s.Linux.Seccomp and the loader's
// error is returned.
func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        ...
        var err error
        switch {
        case c.SeccompProfile != "":
            s.Linux.Seccomp, err = seccomp.LoadProfile(c.SeccompProfile, s)
        case daemon.seccompProfile != nil:
            s.Linux.Seccomp, err = seccomp.LoadProfile(string(daemon.seccompProfile), s)
        default:
            s.Linux.Seccomp, err = seccomp.GetDefaultProfile(s)
        }
        return err
    }
}

加载seccomp规则,并将其转换为oci格式。

https://github.com/moby/moby/blob/master/profiles/seccomp/seccomp_linux.go#L20

// LoadProfile decodes body as a JSON seccomp profile and converts it with
// setupSeccomp, using rs as the spec the profile is set up against.
// A JSON decoding failure is reported as a "Decoding seccomp profile failed"
// error.
func LoadProfile(body string, rs *specs.Spec) (*specs.LinuxSeccomp, error) {
    profile := Seccomp{}
    err := json.Unmarshal([]byte(body), &profile)
    if err != nil {
        return nil, fmt.Errorf("Decoding seccomp profile failed: %v", err)
    }
    return setupSeccomp(&profile, rs)
}

设置对系统调用的默认行为,docker的seccomp默认规则中,该值为SCMP_ACT_ERRNO, 作用是使未匹配任何规则的系统调用失败并返回错误码(默认为EPERM,即Operation not permitted)。

https://github.com/moby/moby/blob/v20.10.14/profiles/seccomp/seccomp_linux.go#L109

// setupSeccomp converts a docker seccomp profile into an OCI LinuxSeccomp.
//
// Excerpt: the profile's DefaultAction is copied unchanged into the new
// configuration.
func setupSeccomp(config *Seccomp, rs *specs.Spec) (*specs.LinuxSeccomp, error) {
    ...
    newConfig.DefaultAction = config.DefaultAction
    ...
}

这项配置在runc中用来传递给C的seccomp_init函数。

https://github.com/opencontainers/runc/blob/v1.0.3/libcontainer/seccomp/seccomp_linux.go#L44

// InitSeccomp sets up seccomp filtering from config (runc side).
//
// Excerpt: a libseccomp filter is created with defaultAction, which is
// computed in the elided code above (presumably from config's default
// action - confirm against the full source).
func InitSeccomp(config *configs.Seccomp) error {
    ...
    filter, err := libseccomp.NewFilter(defaultAction)
    ...
}

https://github.com/opencontainers/runc/blob/v1.0.3/vendor/github.com/seccomp/libseccomp-golang/seccomp.go#L512

// NewFilter creates a seccomp filter with the given default action.
//
// Excerpt: the action is converted to its native form and passed to the C
// function seccomp_init.
func NewFilter(defaultAction ScmpAction) (*ScmpFilter, error) {
    ...
    fPtr := C.seccomp_init(defaultAction.toNative())
    ...
}

docker的seccomp配置中,每条规则结构如下。其中includes, excludes参数定义了过滤规则。

https://github.com/moby/moby/blob/v20.10.14/profiles/seccomp/seccomp.go#L44

// Syscall is one rule of a docker seccomp profile as decoded from JSON.
type Syscall struct {
    // Name is a single syscall name; setupSeccomp rejects a rule that sets
    // both Name and Names.
    Name     string                   `json:"name,omitempty"`
    // Names is a list of syscall names covered by the rule.
    Names    []string                 `json:"names,omitempty"`
    // Action is the seccomp action for the listed syscalls.
    Action   specs.LinuxSeccompAction `json:"action"`
    // ErrnoRet is an optional errno value, copied as-is into the OCI rule.
    ErrnoRet *uint                    `json:"errnoRet,omitempty"`
    // Args are optional argument conditions, copied into the OCI rule.
    Args     []*specs.LinuxSeccompArg `json:"args"`
    // Comment is free-form text; setupSeccomp does not copy it to the OCI rule.
    Comment  string                   `json:"comment"`
    // Includes lists conditions (arches, caps, min kernel) that must hold
    // for the rule to be kept by setupSeccomp.
    Includes Filter                   `json:"includes"`
    // Excludes lists conditions (arches, caps, min kernel) that cause the
    // rule to be skipped by setupSeccomp.
    Excludes Filter                   `json:"excludes"`
}

如果容器的capability中包含存在于Excludes变量中的任一capability, 或者docker软件编译时的架构存在于Excludes变量中的Arches列表,或容器运行时的kernel版本高于或等于Excludes变量中的MinKernel参数,则跳过该条seccomp规则;

如果docker软件编译时的架构不存在于Includes变量中的Arches列表中,或容器的capability未能包含Includes变量中列出的全部capability(缺少其中任意一个即跳过),或容器运行时的kernel版本低于Includes变量中的MinKernel参数,则跳过该条seccomp规则。

https://github.com/moby/moby/blob/v20.10.14/profiles/seccomp/seccomp_linux.go#L113-L151

// setupSeccomp converts a docker seccomp profile into an OCI LinuxSeccomp.
//
// Excerpt: rule filtering. A rule is skipped when any Excludes condition
// matches (arch listed, any listed capability present in the bounding set,
// kernel >= MinKernel) or when any Includes condition fails (arch not
// listed, any listed capability missing from the bounding set, kernel <
// MinKernel). Empty lists and nil MinKernel values impose no condition.
// Errors from the kernel version check abort the conversion.
func setupSeccomp(config *Seccomp, rs *specs.Spec) (*specs.LinuxSeccomp, error) {
    ...
    Loop:
    // Loop through all syscall blocks and convert them to libcontainer format after filtering them
    for _, call := range config.Syscalls {
        if len(call.Excludes.Arches) > 0 {
            if inSlice(call.Excludes.Arches, arch) {
                continue Loop
            }
        }
        if len(call.Excludes.Caps) > 0 {
            // One excluded capability in the bounding set is enough to skip.
            for _, c := range call.Excludes.Caps {
                if inSlice(rs.Process.Capabilities.Bounding, c) {
                    continue Loop
                }
            }
        }
        if call.Excludes.MinKernel != nil {
            if ok, err := kernelGreaterEqualThan(*call.Excludes.MinKernel); err != nil {
                return nil, err
            } else if ok {
                continue Loop
            }
        }
        if len(call.Includes.Arches) > 0 {
            if !inSlice(call.Includes.Arches, arch) {
                continue Loop
            }
        }
        if len(call.Includes.Caps) > 0 {
            // Every included capability must be in the bounding set.
            for _, c := range call.Includes.Caps {
                if !inSlice(rs.Process.Capabilities.Bounding, c) {
                    continue Loop
                }
            }
        }
        if call.Includes.MinKernel != nil {
            if ok, err := kernelGreaterEqualThan(*call.Includes.MinKernel); err != nil {
                return nil, err
            } else if !ok {
                continue Loop
            }
        }
        ...
    }
    ...
}

现在将docker软件的seccomp配置的每个syscall规则,转换成oci配置的格式,结构和转换过程如下:

https://github.com/opencontainers/runc/blob/v1.0.3/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go#L679

// LinuxSyscall is one syscall rule in the OCI seccomp configuration.
type LinuxSyscall struct {
	// Names lists the syscalls the rule applies to.
	Names    []string           `json:"names"`
	// Action is the seccomp action for those syscalls.
	Action   LinuxSeccompAction `json:"action"`
	// ErrnoRet is an optional errno value; omitted from JSON when nil.
	ErrnoRet *uint              `json:"errnoRet,omitempty"`
	// Args are optional argument conditions; omitted from JSON when empty.
	Args     []LinuxSeccompArg  `json:"args,omitempty"`
}

https://github.com/moby/moby/blob/v20.10.14/profiles/seccomp/seccomp_linux.go#L153-L174

// setupSeccomp converts a docker seccomp profile into an OCI LinuxSeccomp.
//
// Excerpt: conversion of each rule that survived filtering. Action and
// ErrnoRet are copied, Name/Names are normalised into Names (setting both is
// an error), the argument conditions are copied by value, and the resulting
// rule is appended to newConfig.Syscalls.
func setupSeccomp(config *Seccomp, rs *specs.Spec) (*specs.LinuxSeccomp, error) {
    ...
    Loop:
    // Loop through all syscall blocks and convert them to libcontainer format after filtering them
    for _, call := range config.Syscalls {
        ...
        newCall := specs.LinuxSyscall{
            Action:   call.Action,
            ErrnoRet: call.ErrnoRet,
        }
        if call.Name != "" && len(call.Names) != 0 {
            return nil, errors.New("'name' and 'names' were specified in the seccomp profile, use either 'name' or 'names'")
        }
        // A single Name becomes a one-element Names list.
        if call.Name != "" {
            newCall.Names = []string{call.Name}
        } else {
            newCall.Names = call.Names
        }
        // Loop through all the arguments of the syscall and convert them
        for _, arg := range call.Args {
            newCall.Args = append(newCall.Args, *arg)
        }

        newConfig.Syscalls = append(newConfig.Syscalls, newCall)
    }

    return newConfig, nil
}

4.11 WithMounts

4.11.1 创建工作目录:mounts

在容器的工作目录创建mounts目录(形如/var/lib/docker/containers/{CONTAINER_ID}/mounts),后续部分文件会先挂载到该目录,再挂载到容器rootfs。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L486-L488

// WithMounts returns a SpecOpts that sets up the container's mounts.
//
// Excerpt: the first step calls daemon.setupContainerMountsRoot for the
// container and returns its error, if any.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        if err := daemon.setupContainerMountsRoot(c); err != nil {
            return err
        }
        ...
    }
}

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L455

// setupContainerMountsRoot creates the container's mounts root directory
// (the path returned by c.MountsResourcePath("")) with mode 0710. The
// directory is owned by the current identity's UID and by the GID of the
// daemon identity mapping's root pair.
func (daemon *Daemon) setupContainerMountsRoot(c *container.Container) error {
	// get the root mount path so we can make it unbindable
	mountsRoot, err := c.MountsResourcePath("")
	if err != nil {
		return err
	}

	owner := idtools.Identity{
		UID: idtools.CurrentIdentity().UID,
		GID: daemon.IdentityMapping().RootPair().GID,
	}
	return idtools.MkdirAllAndChown(mountsRoot, 0710, owner)
}

4.11.2 创建工作目录:shm

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L490-L492

// WithMounts returns a SpecOpts that sets up the container's mounts.
//
// Excerpt: calls daemon.setupIpcDirs for the container and returns its
// error, if any.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        if err := daemon.setupIpcDirs(c); err != nil {
            return err
        }
        ...
    }
}

docker在创建容器时,可以通过--ipc参数,指定容器使用私有或共享的ipc。

  • 如果共享其他容器或宿主机的IPC,在setupIpcDirs函数中,将容器的ShmPath属性更新为要共享的路径。
  • 如果使用私有的IPC或不使用IPC,则将ShmPath置空。
  • 如果IPC模式为shareable,则可以向其他容器共享IPC。创建shm工作目录,路径形如/var/lib/docker/containers/{CONTAINER_ID}/mounts/shm,然后将shm挂载至该路径,并将容器的ShmPath属性设置为该路径。后续该路径将以bind模式被挂载至容器rootfs。

设置ShmPath属性,是为了其他容器在试图共享该容器的IPC时可以得知shm路径。

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L106

// setupIpcDirs sets c.ShmPath according to the container's IPC mode and, for
// shareable (or legacy empty) mode, creates and mounts the shm directory.
//
//   - container mode: ShmPath is copied from the referenced container.
//   - host mode: ShmPath is "/dev/shm"; an error is returned if it cannot be
//     stat'ed.
//   - private / none: ShmPath is cleared.
//   - empty / shareable: unless the container already has a mount for
//     /dev/shm, the shm resource directory is created, a tmpfs is mounted on
//     it, it is chowned to the root pair, and ShmPath is set to it.
//   - any other mode: an "invalid IPC mode" error is returned.
func (daemon *Daemon) setupIpcDirs(c *container.Container) error {
    ipcMode := c.HostConfig.IpcMode

    switch {
    case ipcMode.IsContainer():
        ic, err := daemon.getIpcContainer(ipcMode.Container())
        if err != nil {
            return err
        }
        c.ShmPath = ic.ShmPath

    case ipcMode.IsHost():
        if _, err := os.Stat("/dev/shm"); err != nil {
            return fmt.Errorf("/dev/shm is not mounted, but must be for --ipc=host")
        }
        c.ShmPath = "/dev/shm"

    case ipcMode.IsPrivate(), ipcMode.IsNone():
        // c.ShmPath will/should not be used, so make it empty.
        // Container's /dev/shm mount comes from OCI spec.
        c.ShmPath = ""

    case ipcMode.IsEmpty():
        // A container was created by an older version of the daemon.
        // The default behavior used to be what is now called "shareable".
        fallthrough

    case ipcMode.IsShareable():
        rootIDs := daemon.idMapping.RootPair()
        // A user-provided /dev/shm mount takes the place of the managed one;
        // ShmPath is left unchanged in that case.
        if !c.HasMountFor("/dev/shm") {
            shmPath, err := c.ShmResourcePath()
            if err != nil {
                return err
            }

            if err := idtools.MkdirAllAndChown(shmPath, 0700, rootIDs); err != nil {
                return err
            }

            // tmpfs options: mode 1777 and the size from HostConfig.ShmSize,
            // combined with the container's mount label.
            shmproperty := "mode=1777,size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
            if err := unix.Mount("shm", shmPath, "tmpfs", uintptr(unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV), label.FormatMountLabel(shmproperty, c.GetMountLabel())); err != nil {
                return fmt.Errorf("mounting shm tmpfs: %s", err)
            }
            // Chown again after mounting, this time on the mounted tmpfs.
            if err := os.Chown(shmPath, rootIDs.UID, rootIDs.GID); err != nil {
                return err
            }
            c.ShmPath = shmPath
        }

    default:
        return fmt.Errorf("invalid IPC mode: %v", ipcMode)
    }

    return nil
}

4.11.3 设置工作目录:secret

secret是swarm集群的功能,它将以文件形式被挂载至容器rootfs内。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L494-L502

// WithMounts returns a SpecOpts that sets up the container's mounts.
//
// Excerpt: secret directory setup. A deferred function calls
// daemon.cleanupSecretDir when the named result err is non-nil at return;
// daemon.setupSecretDir is then called and its error returned.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        // Registered before setupSecretDir so that a later failure in this
        // function also triggers the cleanup.
        defer func() {
            if err != nil {
                daemon.cleanupSecretDir(c)
            }
        }()

        if err := daemon.setupSecretDir(c); err != nil {
            return err
        }
        ...
    }
}

创建secret工作目录,路径形如/var/lib/docker/containers/{CONTAINER_ID}/mounts/secrets,并将tmpfs挂载至该路径。

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L163-L174

// setupSecretDir prepares the container's secrets directory.
//
// Excerpt: nothing is done when the container references neither secrets nor
// configs. Otherwise the secrets directory is created with
// daemon.createSecretsDir, and a deferred function cleans it up again if the
// function ends with a non-nil setupErr.
func (daemon *Daemon) setupSecretDir(c *container.Container) (setupErr error) {
    if len(c.SecretReferences) == 0 && len(c.ConfigReferences) == 0 {
        return nil
    }

    if err := daemon.createSecretsDir(c); err != nil {
        return err
    }
    defer func() {
        if setupErr != nil {
            daemon.cleanupSecretDir(c)
        }
    }()
    ...
}

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L285

// createSecretsDir creates the container's secrets directory (mode 0700,
// owned by the daemon's root pair) and mounts a tmpfs on it with the
// nodev, nosuid and noexec options and uid/gid set to the root pair.
// Each failure is returned wrapped with a message naming the failed step.
func (daemon *Daemon) createSecretsDir(c *container.Container) error {
    // retrieve possible remapped range start for root UID, GID
    root := daemon.idMapping.RootPair()

    secretsDir, err := c.SecretMountPath()
    if err != nil {
        return errors.Wrap(err, "error getting container secrets dir")
    }

    // create tmpfs
    err = idtools.MkdirAllAndChown(secretsDir, 0700, root)
    if err != nil {
        return errors.Wrap(err, "error creating secret local mount path")
    }

    ownership := fmt.Sprintf("uid=%d,gid=%d", root.UID, root.GID)
    mountOpts := "nodev,nosuid,noexec," + ownership
    err = mount.Mount("tmpfs", secretsDir, "tmpfs", mountOpts)
    if err != nil {
        return errors.Wrap(err, "unable to setup secret mount")
    }
    return nil
}

将secret内容写入到/var/lib/docker/containers/{CONTAINER_ID}/mounts/secrets/{SECRET_ID}

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L183-L227

// setupSecretDir prepares the container's secrets directory.
//
// Excerpt: for each secret reference, the target file path is computed, the
// secret is fetched from the container's dependency store by SecretID, its
// data is written to the file with the referenced file mode, and the file is
// then chowned (root pair IDs plus uid/gid computed in the elided code) and
// chmodded to the referenced mode.
func (daemon *Daemon) setupSecretDir(c *container.Container) (setupErr error) {
    ...
    for _, s := range c.SecretReferences {
        ...
        fPath, err := c.SecretFilePath(*s)
        ...
        secret, err := c.DependencyStore.Secrets().Get(s.SecretID)
        ...
        if err := ioutil.WriteFile(fPath, secret.Spec.Data, s.File.Mode); err != nil {
        ...
        if err := os.Chown(fPath, rootIDs.UID+uid, rootIDs.GID+gid); err != nil {
            return errors.Wrap(err, "error setting ownership for secret")
        }
        if err := os.Chmod(fPath, s.File.Mode); err != nil {
            return errors.Wrap(err, "error setting file mode for secret")
        }
    }
    ...
}

docker swarm另外还支持通过--config参数向容器注入config,因为其支持go语言的模板引擎,有可能在config中引入secret, 因此,config文件也将写入secrets工作目录,具体路径形如/var/lib/docker/containers/{CONTAINER_ID}/mounts/secrets/{CONFIG_ID}

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L229-L278

// setupSecretDir prepares the container's secrets directory.
//
// Excerpt: for each config reference, the target file path is computed, its
// parent directory is created (mode 0700, owned by rootIDs), the config is
// fetched from the container's dependency store by ConfigID and written with
// the referenced file mode. The file is then chowned to the root pair IDs
// offset by the numeric UID/GID parsed from the reference, and chmodded to
// the referenced mode.
func (daemon *Daemon) setupSecretDir(c *container.Container) (setupErr error) {
    ...
    for _, configRef := range c.ConfigReferences {
        ...
        fPath, err := c.ConfigFilePath(*configRef)
        ...
        if err := idtools.MkdirAllAndChown(filepath.Dir(fPath), 0700, rootIDs); err != nil {
        ...
        config, err := c.DependencyStore.Configs().Get(configRef.ConfigID)
        ...
        if err := ioutil.WriteFile(fPath, config.Spec.Data, configRef.File.Mode); err != nil {
            return errors.Wrap(err, "error injecting config")
        }

        // The reference stores UID and GID as decimal strings.
        uid, err := strconv.Atoi(configRef.File.UID)
        if err != nil {
            return err
        }
        gid, err := strconv.Atoi(configRef.File.GID)
        if err != nil {
            return err
        }

        if err := os.Chown(fPath, rootIDs.UID+uid, rootIDs.GID+gid); err != nil {
            return errors.Wrap(err, "error setting ownership for config")
        }
        if err := os.Chmod(fPath, configRef.File.Mode); err != nil {
            return errors.Wrap(err, "error setting file mode for config")
        }
    }
    ...
}

secret文件写入完毕,以只读方式重新挂载secrets工作目录。

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L280

// setupSecretDir prepares the container's secrets directory.
//
// Excerpt: the final step returns the result of daemon.remountSecretDir.
func (daemon *Daemon) setupSecretDir(c *container.Container) (setupErr error) {
    ...
    return daemon.remountSecretDir(c)
}

https://github.com/moby/moby/blob/v20.10.14/daemon/container_operations_unix.go#L305

// remountSecretDir relabels the container's secrets directory with the
// container's mount label and remounts its tmpfs read-only, keeping uid/gid
// set to the daemon's root pair. A relabel failure is only logged as a
// warning; path lookup and remount failures are returned wrapped.
func (daemon *Daemon) remountSecretDir(c *container.Container) error {
    secretsDir, err := c.SecretMountPath()
    if err != nil {
        return errors.Wrap(err, "error getting container secrets path")
    }

    relabelErr := label.Relabel(secretsDir, c.MountLabel, false)
    if relabelErr != nil {
        logrus.WithError(relabelErr).WithField("dir", secretsDir).Warn("Error while attempting to set selinux label")
    }

    root := daemon.idMapping.RootPair()
    ownership := fmt.Sprintf("uid=%d,gid=%d", root.UID, root.GID)

    // remount secrets ro
    mountOpts := "remount,ro," + ownership
    err = mount.Mount("tmpfs", secretsDir, "tmpfs", mountOpts)
    if err != nil {
        return errors.Wrap(err, "unable to remount dir as readonly")
    }

    return nil
}

4.11.4 setupMounts

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L504-L507

// WithMounts returns a SpecOpts that sets up the container's mounts.
//
// Excerpt: obtains the container's mount list from daemon.setupMounts and
// returns its error, if any.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        ms, err := daemon.setupMounts(c)
        if err != nil {
            return err
        }
        ...
    }
}

遍历容器的挂载点,对每个挂载点调用MountPoint.Setup()方法。跳过tmpfs类型的挂载点,其中tmpfs从容器的HostConfig和MountPoints中收集而来。

https://github.com/moby/moby/blob/v20.10.14/daemon/volumes_unix.go#L25-L36

// setupMounts builds the list of mounts for container c.
//
// Excerpt: the destinations of all tmpfs mounts (from c.TmpfsMounts) are
// collected into a set; mount points whose destination is in that set are
// skipped, and Setup is called on every other mount point to obtain its
// source path.
func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
    var mounts []container.Mount
    // TODO: tmpfs mounts should be part of Mountpoints
    tmpfsMounts := make(map[string]bool)
    tmpfsMountInfo, err := c.TmpfsMounts()
    if err != nil {
        return nil, err
    }
    for _, m := range tmpfsMountInfo {
        tmpfsMounts[m.Destination] = true
    }
    for _, m := range c.MountPoints {
        // tmpfs destinations are not set up here.
        if tmpfsMounts[m.Destination] {
            continue
        }
        ...
        path, err := m.Setup(c.MountLabel, daemon.idMapping.RootPair(), checkfunc)
        ...
    }
    ...
}

https://github.com/moby/moby/blob/v20.10.14/container/container_unix.go#L421

// TmpfsMounts returns the container's tmpfs mounts. These come from two
// places: every entry of HostConfig.Tmpfs (destination mapped to mount
// data), and every mount point of type tmpfs, whose mount data is produced
// by the volume parser from its tmpfs options and read-only flag.
// A failure to convert tmpfs options aborts the listing with that error.
func (container *Container) TmpfsMounts() ([]Mount, error) {
    parser := volumemounts.NewParser(container.OS)
    var result []Mount

    for target, opts := range container.HostConfig.Tmpfs {
        entry := Mount{
            Source:      "tmpfs",
            Destination: target,
            Data:        opts,
        }
        result = append(result, entry)
    }

    for target, mp := range container.MountPoints {
        if mp.Type != mounttypes.TypeTmpfs {
            continue
        }
        opts, err := parser.ConvertTmpfsOptions(mp.Spec.TmpfsOptions, mp.Spec.ReadOnly)
        if err != nil {
            return nil, err
        }
        entry := Mount{
            Source:      "tmpfs",
            Destination: target,
            Data:        opts,
        }
        result = append(result, entry)
    }

    return result, nil
}

MountPoint.Setup()方法用于准备挂载点的源路径。

如果挂载点是卷,则根据卷类型各自的方法挂载卷,返回卷路径。

如果挂载点不是卷类型,而是通过bind mount挂载(仅限使用-v/--volume参数,不含使用--mount的情况),则创建源路径目录并设置其属主。

https://github.com/moby/moby/blob/v20.10.14/volume/mounts/mounts.go#L98

// Setup prepares the source of the mount point and returns its path.
//
// Excerpt:
//   - When SkipMountpointCreation is set, m.Source is returned unchanged.
//   - For a volume-backed mount point, the volume is mounted under m.ID (a
//     random ID is generated when m.ID is empty); on success the ID is
//     stored, the active counter is incremented and the mounted path is
//     returned.
//   - For a bind mount, the source directory is created with mode 0755 and
//     rootIDs ownership via MkdirAllAndChownNew; m.Source is returned.
func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (path string, err error) {
    if m.SkipMountpointCreation {
        return m.Source, nil
    }
    ...
    if m.Volume != nil {
        id := m.ID
        if id == "" {
            id = stringid.GenerateRandomID()
        }
        path, err := m.Volume.Mount(id)
        if err != nil {
            return "", errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
        }

        // The ID is only recorded once the mount has succeeded.
        m.ID = id
        m.active++
        return path, nil
    }
    ...
    if m.Type == mounttypes.TypeBind {
        ...
        if err := idtools.MkdirAllAndChownNew(m.Source, 0755, rootIDs); err != nil {
            ...
        }
    }
    return m.Source, nil
}

将挂载点添加至mounts变量中。如果挂载点的目的路径是网络相关的文件,则暂不处理,仅设置对应的容器属性。

https://github.com/moby/moby/blob/v20.10.14/daemon/volumes_unix.go#L55-L76

// setupMounts builds the list of mounts for container c.
//
// Excerpt: each mount point is first offered to c.TrySetNetworkMount; when
// that returns false, a container.Mount is built from the mount point
// (source path, destination, writability, propagation and, for bind mounts
// with bind options, the non-recursive flag) and appended to mounts. For
// volume-backed mount points a "mount" volume event is logged.
func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
    ...
    for _, m := range c.MountPoints {
        ...
        // Network files are recorded on the container instead of being
        // added to the mount list here.
        if !c.TrySetNetworkMount(m.Destination, path) {
            mnt := container.Mount{
                Source:      path,
                Destination: m.Destination,
                Writable:    m.RW,
                Propagation: string(m.Propagation),
            }
            if m.Spec.Type == mounttypes.TypeBind && m.Spec.BindOptions != nil {
                mnt.NonRecursive = m.Spec.BindOptions.NonRecursive
            }
            if m.Volume != nil {
                attributes := map[string]string{
                    "driver":      m.Volume.DriverName(),
                    "container":   c.ID,
                    "destination": m.Destination,
                    "read/write":  strconv.FormatBool(m.RW),
                    "propagation": string(m.Propagation),
                }
                daemon.LogVolumeEvent(m.Volume.Name(), "mount", attributes)
            }
            mounts = append(mounts, mnt)
        }
    }
    ...
}

https://github.com/moby/moby/blob/v20.10.14/container/container_unix.go#L36

// TrySetNetworkMount records path on the container when destination is one
// of the network files /etc/resolv.conf, /etc/hostname or /etc/hosts,
// storing it in ResolvConfPath, HostnamePath or HostsPath respectively.
// It reports whether destination was one of those files; for any other
// destination the container is left unchanged.
func (container *Container) TrySetNetworkMount(destination string, path string) bool {
    switch destination {
    case "/etc/resolv.conf":
        container.ResolvConfPath = path
    case "/etc/hostname":
        container.HostnamePath = path
    case "/etc/hosts":
        container.HostsPath = path
    default:
        return false
    }
    return true
}

对mounts变量按照目录层级进行排序,避免出现子目录先于父级目录被挂载的情况。

https://github.com/moby/moby/blob/v20.10.14/daemon/volumes_unix.go#L79

// setupMounts builds the list of mounts for container c.
//
// Excerpt: after the loop over the mount points, the collected mounts are
// sorted with sortMounts.
func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
    ...
	for _, m := range c.MountPoints {
        ...
    }
    mounts = sortMounts(mounts)
    ...
}

https://github.com/moby/moby/blob/v20.10.14/daemon/volumes_unix.go#L101

// sortMounts sorts m in place using the ordering implemented by the mounts
// type and returns the same slice.
func sortMounts(m []container.Mount) []container.Mount {
    sortable := mounts(m)
    sort.Sort(sortable)
    return m
}

将/etc/resolv.conf,/etc/hostname,/etc/hosts添加到挂载列表中。

https://github.com/moby/moby/blob/v20.10.14/daemon/volumes_unix.go#L80-L95

// setupMounts builds the list of mounts for container c.
//
// Excerpt: the container's network mounts are obtained from
// c.NetworkMounts. Every network mount whose source path starts with
// daemon.repository is chowned to the daemon's root pair. The network mounts
// are then appended after the already collected mounts and the combined list
// is returned.
func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
    ...
    netMounts := c.NetworkMounts()
    ...
    rootIDs := daemon.idMapping.RootPair()
    for _, mnt := range netMounts {
        ...
        // Index == 0 means mnt.Source begins with daemon.repository.
        if strings.Index(mnt.Source, daemon.repository) == 0 {
            if err := os.Chown(mnt.Source, rootIDs.UID, rootIDs.GID); err != nil {
                return nil, err
            }
        }
    }
    return append(mounts, netMounts...), nil
}

https://github.com/moby/moby/blob/v20.10.14/container/container_unix.go#L64

// NetworkMounts returns the mounts for /etc/resolv.conf, /etc/hostname and
// /etc/hosts, in that order.
//
// An entry is produced for each file whose path field on the container is
// non-empty and can be stat'ed. A non-empty path that cannot be stat'ed is
// logged as a warning and skipped. The returned slice is nil when no entry
// qualifies.
func (container *Container) NetworkMounts() []Mount {
    var mounts []Mount
    // shared is forwarded to label.Relabel below; it is true when the network
    // mode reports IsContainer().
    shared := container.HostConfig.NetworkMode.IsContainer()
    parser := volumemounts.NewParser(container.OS)
    if container.ResolvConfPath != "" {
        if _, err := os.Stat(container.ResolvConfPath); err != nil {
            logrus.Warnf("ResolvConfPath set to %q, but can't stat this filename (err = %v); skipping", container.ResolvConfPath, err)
        } else {
            // Writable unless the rootfs is read-only ...
            writable := !container.HostConfig.ReadonlyRootfs
            if m, exists := container.MountPoints["/etc/resolv.conf"]; exists {
                // ... but an existing mount point for this destination
                // dictates the mode through its RW flag.
                writable = m.RW
            } else {
                // Relabel only when no mount point exists for this
                // destination; the result of Relabel is not inspected.
                label.Relabel(container.ResolvConfPath, container.MountLabel, shared)
            }
            mounts = append(mounts, Mount{
                Source:      container.ResolvConfPath,
                Destination: "/etc/resolv.conf",
                Writable:    writable,
                Propagation: string(parser.DefaultPropagationMode()),
            })
        }
    }
    // Same handling as above, for /etc/hostname.
    if container.HostnamePath != "" {
        if _, err := os.Stat(container.HostnamePath); err != nil {
            logrus.Warnf("HostnamePath set to %q, but can't stat this filename (err = %v); skipping", container.HostnamePath, err)
        } else {
            writable := !container.HostConfig.ReadonlyRootfs
            if m, exists := container.MountPoints["/etc/hostname"]; exists {
                writable = m.RW
            } else {
                label.Relabel(container.HostnamePath, container.MountLabel, shared)
            }
            mounts = append(mounts, Mount{
                Source:      container.HostnamePath,
                Destination: "/etc/hostname",
                Writable:    writable,
                Propagation: string(parser.DefaultPropagationMode()),
            })
        }
    }
    // Same handling as above, for /etc/hosts.
    if container.HostsPath != "" {
        if _, err := os.Stat(container.HostsPath); err != nil {
            logrus.Warnf("HostsPath set to %q, but can't stat this filename (err = %v); skipping", container.HostsPath, err)
        } else {
            writable := !container.HostConfig.ReadonlyRootfs
            if m, exists := container.MountPoints["/etc/hosts"]; exists {
                writable = m.RW
            } else {
                label.Relabel(container.HostsPath, container.MountLabel, shared)
            }
            mounts = append(mounts, Mount{
                Source:      container.HostsPath,
                Destination: "/etc/hosts",
                Writable:    writable,
                Propagation: string(parser.DefaultPropagationMode()),
            })
        }
    }
    return mounts
}

4.11.5 添加IPC挂载点

将IPC挂载点添加到mount列表中。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L509-L511

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
            ms = append(ms, c.IpcMounts()...)
        }
        ...
    }
}

https://github.com/moby/moby/blob/v20.10.14/container/container_unix.go#L200

// IpcMounts returns the /dev/shm mount backed by container.ShmPath.
//
// It returns a nil slice when the container already has a mount for /dev/shm
// or when ShmPath is empty. Otherwise it returns a single writable mount with
// the parser's default propagation mode.
func (container *Container) IpcMounts() []Mount {
    var mounts []Mount
    parser := volumemounts.NewParser(container.OS)

    // An existing mount for /dev/shm takes precedence; add nothing.
    if container.HasMountFor("/dev/shm") {
        return mounts
    }
    // No shm path configured for this container; add nothing.
    if container.ShmPath == "" {
        return mounts
    }

    // The result of SetFileLabel is not inspected here.
    label.SetFileLabel(container.ShmPath, container.MountLabel)
    mounts = append(mounts, Mount{
        Source:      container.ShmPath,
        Destination: "/dev/shm",
        Writable:    true,
        Propagation: string(parser.DefaultPropagationMode()),
    })

    return mounts
}

4.11.6 TmpfsMounts(): 添加用户设置的tmpfs挂载点

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L513-L517

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        tmpfsMounts, err := c.TmpfsMounts()
        if err != nil {
            return err
        }
        ms = append(ms, tmpfsMounts...)
        ...
    }
}

参考docker官方文档,用户可以通过--tmpfs或--mount type=tmpfs,destination=参数指定要挂载的tmpfs。

https://github.com/moby/moby/blob/v20.10.14/container/container_unix.go#L421

// TmpfsMounts returns the tmpfs mounts configured for the container.
//
// Two sources are combined: every entry of HostConfig.Tmpfs (destination ->
// option string, used verbatim as Data), followed by every entry of
// MountPoints whose Type is mounttypes.TypeTmpfs (options converted with
// parser.ConvertTmpfsOptions). All returned mounts use "tmpfs" as Source.
//
// It returns a nil slice and the conversion error if ConvertTmpfsOptions
// fails for any mount point.
func (container *Container) TmpfsMounts() ([]Mount, error) {
    parser := volumemounts.NewParser(container.OS)
    var mounts []Mount
    for dest, data := range container.HostConfig.Tmpfs {
        mounts = append(mounts, Mount{
            Source:      "tmpfs",
            Destination: dest,
            Data:        data,
        })
    }
    for dest, mnt := range container.MountPoints {
        // Only tmpfs-typed mount points are handled here.
        if mnt.Type == mounttypes.TypeTmpfs {
            data, err := parser.ConvertTmpfsOptions(mnt.Spec.TmpfsOptions, mnt.Spec.ReadOnly)
            if err != nil {
                return nil, err
            }
            mounts = append(mounts, Mount{
                Source:      "tmpfs",
                Destination: dest,
                Data:        data,
            })
        }
    }
    return mounts, nil
}

4.11.7 添加secret挂载点

在4.11.3节中,已完成对secret工作目录的设置,现将secret挂载点添加至mount列表中。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L519-L523

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        secretMounts, err := c.SecretMounts()
        if err != nil {
            return err
        }
        ms = append(ms, secretMounts...)
        ...
    }
}

https://github.com/moby/moby/blob/v20.10.14/container/container_unix.go#L223

// SecretMounts returns read-only mounts for the container's secret and
// config references, secrets first.
//
// Secret references whose File field is nil are skipped. The source path of
// each mount comes from SecretFilePath / ConfigFilePath and the destination
// from getSecretTargetPath / getConfigTargetPath. The first error returned by
// either path lookup aborts the call and is returned with a nil slice.
func (container *Container) SecretMounts() ([]Mount, error) {
    var mounts []Mount
    for _, r := range container.SecretReferences {
        // A secret reference without a file target is not mounted.
        if r.File == nil {
            continue
        }
        src, err := container.SecretFilePath(*r)
        if err != nil {
            return nil, err
        }
        mounts = append(mounts, Mount{
            Source:      src,
            Destination: getSecretTargetPath(r),
            Writable:    false,
        })
    }
    for _, r := range container.ConfigReferences {
        fPath, err := container.ConfigFilePath(*r)
        if err != nil {
            return nil, err
        }
        mounts = append(mounts, Mount{
            Source:      fPath,
            Destination: getConfigTargetPath(r),
            Writable:    false,
        })
    }

    return mounts, nil
}

其中secret、config的默认挂载根路径分别为/run/secrets和/。

https://github.com/moby/moby/blob/v20.10.14/container/container_unix.go#L30-L31

const (
    ...
    containerConfigMountPath = "/"
    containerSecretMountPath = "/run/secrets"
)

4.11.8 合并用户指定的挂载点和默认挂载点

首先对挂载点排序,避免出现子目录先于父级目录被挂载的情况。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L525

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        sort.Sort(mounts(ms))
        ...
    }
}

更新已有的oci配置中的mount列表,移除以下挂载点:

  • 用户指定的相同路径的挂载点
  • 用户挂载了dev目录,则移除dev目录下所有的挂载点
  • 如果用户设置IpcMode为none,则移除/dev/shm

另外设置了/dev/shm挂载选项中的size选项。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L527-L565

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        mounts := ms

        userMounts := make(map[string]struct{})
        for _, m := range mounts {
            userMounts[m.Destination] = struct{}{}
        }

        defaultMounts := s.Mounts[:0]
        _, mountDev := userMounts["/dev"]
        for _, m := range s.Mounts {
            if _, ok := userMounts[m.Destination]; ok {
                // filter out mount overridden by a user supplied mount
                continue
            }
            if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
                // filter out everything under /dev if /dev is user-mounted
                continue
            }

            if m.Destination == "/dev/shm" {
                if c.HostConfig.IpcMode.IsNone() {
                    // filter out /dev/shm for "none" IpcMode
                    continue
                }
                // set size for /dev/shm mount from spec
                sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
                m.Options = append(m.Options, sizeOpt)
            }

            defaultMounts = append(defaultMounts, m)
        }

        s.Mounts = defaultMounts
        ...
    }
}

再遍历用户指定的挂载点,合并到oci配置中。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L566

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        for _, m := range mounts {
            ...
        }
        ...
    }
}

如果挂载源是tmpfs,需要额外添加一些挂载选项。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L567-L582

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        for _, m := range mounts {
            if m.Source == "tmpfs" {
                data := m.Data
                parser := volumemounts.NewParser("linux")
                options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
                if data != "" {
                    options = append(options, strings.Split(data, ",")...)
                }

                merged, err := mount.MergeTmpfsOptions(options)
                if err != nil {
                    return err
                }

                s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
                continue
            }
            ...
        }
        ...
    }
}

对其他类型挂载源,使用bind mount,并设置rootfs的传播模式。

  • 如果挂载点为SHARED或RSHARED模式,且rootfs当前的传播模式不是SHARED或RSHARED,则将rootfs的传播模式设置为SHARED
  • 如果挂载点为SLAVE或RSLAVE模式,且rootfs当前的传播模式不是SHARED、RSHARED、SLAVE或RSLAVE,则将rootfs的传播模式设置为RSLAVE

如果开启了userns, 还要另外保留一些挂载选项。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L584-L660

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...
        for _, m := range mounts {
            ...
            mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
            ...
            pFlag := mountPropagationMap[m.Propagation]
            switch pFlag {
            case mount.SHARED, mount.RSHARED:
                if err := ensureShared(m.Source); err != nil {
                    return err
                }
                rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
                if rootpg != mount.SHARED && rootpg != mount.RSHARED {
                    s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
                }
            case mount.SLAVE, mount.RSLAVE:
                var fallback bool
                if err := ensureSharedOrSlave(m.Source); err != nil {
                    ...
                    fallback = true
                    ...
                }
                if !fallback {
                    rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
                    if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
                        s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
                    }
                }
            }

            bindMode := "rbind"
            if m.NonRecursive {
                bindMode = "bind"
            }
            opts := []string{bindMode}
            if !m.Writable {
                opts = append(opts, "ro")
            }
            if pFlag != 0 {
                opts = append(opts, mountPropagationReverseMap[pFlag])
            }
            ...
            if daemon.configStore.RemappedRoot != "" || sys.RunningInUserNS() {
                unprivOpts, err := getUnprivilegedMountFlags(m.Source)
                if err != nil {
                    return err
                }
                opts = append(opts, unprivOpts...)
            }

            mt.Options = opts
            s.Mounts = append(s.Mounts, mt)
        }
        ...
    }
}

4.11.9 只读挂载选项

用户可使用docker run --read-only选项,设置容器的rootfs为只读。如果配置了该选项,除了部分挂载点及用户指定的挂载点,docker默认的挂载点都将被设置只读选项。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L663-L675

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...        
        if s.Root.Readonly {
            for i, m := range s.Mounts {
                switch m.Destination {
                case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
                    continue
                }
                if _, ok := userMounts[m.Destination]; !ok {
                    if !inSlice(m.Options, "ro") {
                        s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
                    }
                }
            }
        }
        ...
    }
}

如果用户设置了特权选项--privileged, 则移除对/sys挂载点的只读选项。

因为/proc和/sys目录下有一些敏感的文件,docker默认通过oci的Linux.ReadonlyPaths和Linux.MaskedPaths配置对相关路径进行了保护。如果开启了特权选项,这两项配置也会一并被清空。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L677-L686

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...  
        if c.HostConfig.Privileged {
            // clear readonly for /sys
            for i := range s.Mounts {
                if s.Mounts[i].Destination == "/sys" {
                    clearReadOnly(&s.Mounts[i])
                }
            }
            s.Linux.ReadonlyPaths = nil
            s.Linux.MaskedPaths = nil
        }
        ...
    }
}

如果用户开启了user namespace, 或特权选项,移除cgroup的只读保护。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L690-L696

func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
        ...  
        if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
            for i, m := range s.Mounts {
                if m.Type == "cgroup" {
                    clearReadOnly(&s.Mounts[i])
                }
            }
        }

        return nil

    }
}

4.12 设置libnetwork hook: WithLibnetwork

向runc的prestart hook中添加libnetwork-setkey命令。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L64

// WithLibnetwork returns a spec option that registers the libnetwork-setkey
// prestart hook.
//
// The option makes sure s.Hooks is non-nil, then, for every namespace in
// s.Linux.Namespaces of type "network" that has an empty Path, and only when
// the container's networking is not disabled, appends one prestart hook. The
// hook executes /proc/<daemon pid>/exe with the arguments
// "libnetwork-setkey", "-exec-root=<exec root>", the container ID and the
// truncated network controller ID. The option always returns nil.
func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        if s.Hooks == nil {
            s.Hooks = &specs.Hooks{}
        }
        for _, ns := range s.Linux.Namespaces {
            if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
                // The hook binary is the currently running process's own
                // executable, addressed through /proc/<pid>/exe.
                target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
                shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
                s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
                    Path: target,
                    Args: []string{
                        "libnetwork-setkey",
                        "-exec-root=" + daemon.configStore.GetExecRoot(),
                        c.ID,
                        shortNetCtlrID,
                    },
                })
            }
        }
        return nil
    }
}

其中libnetwork-setkey是以reexec模式实现的docker子命令,关于该命令实现的源码分析,我们在TODO: docker reexec libnetwork-setkey子命令源码分析一文中详细展开。

4.13 设置oci配置中的apparmor规则:WithApparmor

将容器的apparmor规则传递至oci配置中。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L129

// WithApparmor returns a spec option that sets s.Process.ApparmorProfile.
//
// The option does nothing when apparmor.IsEnabled() is false. Otherwise the
// profile is chosen in this order: the container's own AppArmorProfile if
// non-empty, unconfinedAppArmorProfile for a privileged container, and
// defaultAppArmorProfile in all remaining cases. Whenever the chosen profile
// equals defaultAppArmorProfile, ensureDefaultAppArmorProfile is called first
// and its error, if any, is returned without touching the spec.
func WithApparmor(c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        if apparmor.IsEnabled() {
            var appArmorProfile string
            if c.AppArmorProfile != "" {
                appArmorProfile = c.AppArmorProfile
            } else if c.HostConfig.Privileged {
                appArmorProfile = unconfinedAppArmorProfile
            } else {
                appArmorProfile = defaultAppArmorProfile
            }

            // This also fires when the container explicitly names the
            // default profile, not only when it was picked as the fallback.
            if appArmorProfile == defaultAppArmorProfile {
                if err := ensureDefaultAppArmorProfile(); err != nil {
                    return err
                }
            }
            s.Process.ApparmorProfile = appArmorProfile
        }
        return nil
    }
}

4.14 设置oci配置中的selinux标签:WithSelinux

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L120

// WithSelinux returns a spec option that copies the container's SELinux
// labels into the spec: c.GetProcessLabel() into s.Process.SelinuxLabel and
// c.MountLabel into s.Linux.MountLabel. The option always returns nil.
func WithSelinux(c *container.Container) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        s.Process.SelinuxLabel = c.GetProcessLabel()
        s.Linux.MountLabel = c.MountLabel
        return nil
    }
}

4.15 设置OOM优先级: WithOOMScore

为了便于控制OOM killer的行为,内核提供了/proc/<pid>/oom_score_adj(用于取代已废弃的/proc/<pid>/oom_adj),通过它可以调整进程被杀死的优先级,以避免系统中的重要进程被杀死。

docker可以通过--oom-score-adj选项设置该参数。此处,将该配置传递给oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L1031

func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
    ...
    opts = append(opts,
        ...
        WithOOMScore(&c.HostConfig.OomScoreAdj),
    )

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L112

// WithOOMScore returns a spec option that assigns score to
// s.Process.OOMScoreAdj. The pointer itself is stored, not a copy of the
// value it points to, so the spec and the caller share the same int.
func WithOOMScore(score *int) coci.SpecOpts {
    return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        s.Process.OOMScoreAdj = score
        return nil
    }
}

4.16 NoNewPrivileges

通过dockerd的--no-new-privileges参数,或容器创建时的--security-opt=no-new-privileges参数,可以使容器内的普通进程无法通过suid/sgid、文件capability等方式获得父进程没有的权限。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L1033-L1035

func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
    ...
    if c.NoNewPrivileges {
        opts = append(opts, coci.WithNoNewPrivileges)
    }
    ...
}

此处将该选项传递给oci配置。

https://github.com/moby/moby/blob/v20.10.14/vendor/github.com/containerd/containerd/oci/spec_opts.go#L416

// WithNoNewPrivileges is a spec option that sets s.Process.NoNewPrivileges to
// true. It calls setProcess(s) first (presumably to make sure s.Process is
// initialised; confirm against its definition) and always returns nil.
func WithNoNewPrivileges(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
    setProcess(s)
    s.Process.NoNewPrivileges = true
    return nil
}

4.17 设置保护路径

将受保护的系统路径传递至oci配置。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L1038-L1043

func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
    ...
    if c.HostConfig.MaskedPaths != nil {
        opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
    }
    if c.HostConfig.ReadonlyPaths != nil {
        opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
    }
    ...
}

4.18 rootless cgroup

如果dockerd运行在rootless模式, 更新cgroup规则。

目前,在rootless模式下,cgroup相关的资源限制选项仅在同时使用cgroup v2和systemd cgroup驱动时才受支持。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L1044-L1046

func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
    ...
    if daemon.configStore.Rootless {
        opts = append(opts, WithRootless(daemon))
    }
    ...
}

根据rootless运行的用户支持的controller,更新oci中的配置。其中,可用的controller从文件/sys/fs/cgroup/user.slice/user-<EUID>.slice/cgroup.controllers中读取(<EUID>取自环境变量ROOTLESSKIT_PARENT_EUID),默认仅支持memory, pids。

https://github.com/moby/moby/blob/v20.10.14/daemon/oci_linux.go#L89

// WithRootless returns a spec option that adapts the spec for a rootless
// daemon by calling specconv.ToRootless.
//
// The list of cgroup v2 controllers passed to ToRootless stays nil unless the
// daemon's cgroup driver is cgroupSystemdDriver. With the systemd driver the
// option returns an error when the cgroup mode is not cdcgroups.Unified, when
// $ROOTLESSKIT_PARENT_EUID is unset or empty, or when the controllers file
// cannot be read.
func WithRootless(daemon *Daemon) coci.SpecOpts {
    return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
        var v2Controllers []string
        if daemon.getCgroupDriver() == cgroupSystemdDriver {
            if cdcgroups.Mode() != cdcgroups.Unified {
                return errors.New("rootless systemd driver doesn't support cgroup v1")
            }
            rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
            if rootlesskitParentEUID == "" {
                return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
            }
            // The controllers are read from the user slice of the EUID taken
            // from the environment variable above.
            controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%s.slice/cgroup.controllers", rootlesskitParentEUID)
            controllersFile, err := ioutil.ReadFile(controllersPath)
            if err != nil {
                return err
            }
            // The file content is split on whitespace into controller names.
            v2Controllers = strings.Fields(string(controllersFile))
        }
        return specconv.ToRootless(s, v2Controllers)
    }
}

https://github.com/moby/moby/blob/v20.10.14/rootless/specconv/specconv_linux.go#L17

// ToRootless converts spec for rootless use by delegating to toRootless,
// passing the value returned by getCurrentOOMScoreAdj() as the current OOM
// score adjustment.
func ToRootless(spec *specs.Spec, v2Controllers []string) error {
    return toRootless(spec, v2Controllers, getCurrentOOMScoreAdj())
}

在oci配置中移除不支持的资源限制策略。

https://github.com/moby/moby/blob/v20.10.14/rootless/specconv/specconv_linux.go#L36

// toRootless strips from spec the cgroup resource settings that are not
// covered by v2Controllers and clamps the OOM score adjustment.
//
// With an empty v2Controllers, spec.Linux.Resources and
// spec.Linux.CgroupsPath are cleared entirely. Otherwise, if Resources is
// non-nil, Devices, HugepageLimits and Network are always cleared, and
// Memory, CPU, Pids, BlockIO and Rdma are cleared when their controller
// ("memory", "cpu", "pids", "io", "rdma") is not listed. Without "cpuset",
// only the Cpus and Mems fields of a remaining CPU section are emptied.
//
// Finally, a non-nil spec.Process.OOMScoreAdj that is lower than
// currentOOMScoreAdj is raised to currentOOMScoreAdj. The function always
// returns nil.
func toRootless(spec *specs.Spec, v2Controllers []string, currentOOMScoreAdj int) error {
    if len(v2Controllers) == 0 {
        // Remove cgroup settings.
        spec.Linux.Resources = nil
        spec.Linux.CgroupsPath = ""
    } else {
        if spec.Linux.Resources != nil {
            // Set of available controller names for constant-time lookups.
            m := make(map[string]struct{})
            for _, s := range v2Controllers {
                m[s] = struct{}{}
            }
            // Remove devices: https://github.com/containers/crun/issues/255
            spec.Linux.Resources.Devices = nil
            if _, ok := m["memory"]; !ok {
                spec.Linux.Resources.Memory = nil
            }
            if _, ok := m["cpu"]; !ok {
                spec.Linux.Resources.CPU = nil
            }
            if _, ok := m["cpuset"]; !ok {
                // CPU may already be nil from the "cpu" check above.
                if spec.Linux.Resources.CPU != nil {
                    spec.Linux.Resources.CPU.Cpus = ""
                    spec.Linux.Resources.CPU.Mems = ""
                }
            }
            if _, ok := m["pids"]; !ok {
                spec.Linux.Resources.Pids = nil
            }
            if _, ok := m["io"]; !ok {
                spec.Linux.Resources.BlockIO = nil
            }
            if _, ok := m["rdma"]; !ok {
                spec.Linux.Resources.Rdma = nil
            }
            // These two are dropped regardless of the controller list.
            spec.Linux.Resources.HugepageLimits = nil
            spec.Linux.Resources.Network = nil
        }
    }

    // Only ever raise the requested value to the current one, never lower it.
    if spec.Process.OOMScoreAdj != nil && *spec.Process.OOMScoreAdj < currentOOMScoreAdj {
        *spec.Process.OOMScoreAdj = currentOOMScoreAdj
    }
    return nil
}