tags: k8s,kubernetes,container,源码分析

k8s volume plugin 源码分析: Local卷 SetUp 实现

1. 调用点

在VolumeManager中挂载卷时,执行SetUp方法,将卷挂载至pod内。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/util/operationexecutor/operation_generator.go#L725

// GenerateMountVolumeFunc (excerpt; "..." marks elided code) defines the
// mountVolumeFunc closure, which calls SetUp on the volume's mounter and,
// further down, calls actualStateOfWorld.MarkVolumeAsMounted.
func (og *operationGenerator) GenerateMountVolumeFunc(...) volumetypes.GeneratedOperations {
    ...
    mountVolumeFunc := func() volumetypes.OperationContext {
        ...
        // Execute mount: SetUp receives the pod's fs user, the fsGroup, the
        // desired size limit and the fsGroup change policy.
        mountErr := volumeMounter.SetUp(volume.MounterArgs{
            FsUser:              util.FsUserFrom(volumeToMount.Pod),
            FsGroup:             fsGroup,
            DesiredSize:         volumeToMount.DesiredSizeLimit,
            FSGroupChangePolicy: fsGroupChangePolicy,
        })
        ...
        // Record the volume as mounted in the actual state of world.
        markVolMountedErr := actualStateOfWorld.MarkVolumeAsMounted(markOpts)
        ...
    }
    ...
}

不是所有类型的卷,都实现了SetUp方法。不同类型的卷的实现也有所不同,这里以local卷的实现为例进行分析。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L521

// SetUp sets the volume up at the directory returned by GetPath by
// delegating to SetUpAt with the same mounterArgs.
func (m *localVolumeMounter) SetUp(mounterArgs volume.MounterArgs) error {
    return m.SetUpAt(m.GetPath(), mounterArgs)
}

func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
    ...
}

其中,m.GetPath得到的路径一般为 /var/lib/kubelet/pods/{POD_UID}/volumes/kubernetes.io~local-volume/{VOLUME_NAME}(插件名经EscapeQualifiedName转义,其中的"/"被替换为"~"), 这就是最终要把卷挂载到的目的路径。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L493

// GetPath returns the pod volume directory that the plugin host reports for
// this pod UID, the escaped local-volume plugin name and the volume name.
func (l *localVolume) GetPath() string {
    return l.plugin.host.GetPodVolumeDir(l.podUID, utilstrings.EscapeQualifiedName(localVolumePluginName), l.volName)
}

2. 校验路径穿越

要求globalPath中不能包含"..",以防止目录穿越。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L534

// SetUpAt (excerpt): validates m.globalPath before it is used.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
    ...
    // Reject a globalPath that contains a ".." component; the validation
    // error is wrapped together with the offending path.
    err := validation.ValidatePathNoBacksteps(m.globalPath)
    if err != nil {
        return fmt.Errorf("invalid path: %s %v", m.globalPath, err)
    }
    ...
}

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/validation/pv_validation.go#L62

// ValidatePathNoBacksteps returns an error if any slash-separated component
// of targetPath is exactly "..", and nil otherwise.
func ValidatePathNoBacksteps(targetPath string) error {
    // Normalize OS-specific separators to "/" so one split covers all platforms.
    parts := strings.Split(filepath.ToSlash(targetPath), "/")
    for _, item := range parts {
        // Only a whole component equal to ".." is rejected; a name that merely
        // contains dots (e.g. "a..b") passes.
        if item == ".." {
            return errors.New("must not contain '..'")
        }
    }

    return nil
}

其中,globalPath来源于PV Spec中的local.path配置(目录类型直接使用该路径,块设备类型则以卷名拼接到插件的全局目录下),存在风险;历史上也确实在此处出过问题,该问题的修复引入了此校验。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L120

// NewMounter (excerpt): resolves the volume's global local path from the
// volume spec via getGlobalLocalPath.
func (plugin *localVolumePlugin) NewMounter(spec *volume.Spec, pod *v1.Pod, _ volume.VolumeOptions) (volume.Mounter, error) {
    ...
    globalLocalPath, err := plugin.getGlobalLocalPath(spec)
    ...
}

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L268

// getGlobalLocalPath (excerpt) returns the global path for a local PV based on
// the file type of spec.PersistentVolume.Spec.Local.Path. It returns an error
// if the host does not implement volume.KubeletVolumeHost, if the file type
// cannot be determined, or if the path is neither a directory nor a block device.
func (plugin *localVolumePlugin) getGlobalLocalPath(spec *volume.Spec) (string, error) {
    ...
    // GetHostUtil is reached through the KubeletVolumeHost interface.
    kvh, ok := plugin.host.(volume.KubeletVolumeHost)
    if !ok {
        return "", fmt.Errorf("plugin volume host does not implement KubeletVolumeHost interface")
    }

    fileType, err := kvh.GetHostUtil().GetFileType(spec.PersistentVolume.Spec.Local.Path)
    if err != nil {
        return "", err
    }
    switch fileType {
    case hostutil.FileTypeDirectory:
        // Directory: the PV's configured path is used as-is.
        return spec.PersistentVolume.Spec.Local.Path, nil
    case hostutil.FileTypeBlockDev:
        // Block device: the path is the plugin's block-device base global
        // path joined with the spec name.
        return filepath.Join(plugin.generateBlockDeviceBaseGlobalPath(), spec.Name()), nil
    default:
        return "", fmt.Errorf("only directory and block device are supported")
    }
}

3. 避免重复挂载

// SetUpAt (excerpt): returns early when dir is already a mount point.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
    ...
    notMnt, err := mount.IsNotMountPoint(m.mounter, dir)
    ...    
    // A "does not exist" error is tolerated here; any other error is logged
    // and returned.
    if err != nil && !os.IsNotExist(err) {
        klog.Errorf("cannot validate mount point: %s %v", dir, err)
        return err
    }

    // dir is already a mount point, so there is nothing left to do.
    if !notMnt {
        return nil
    }
    ...
}

4. bind mount

使用bind mount, 是为了允许一个卷被挂载至多个pod。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L587

// SetUpAt (excerpt): creates dir, bind-mounts globalPath onto it, and cleans
// up dir if the mount fails.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
    ...
        // Create the target directory (and any missing parents) with mode 0750.
        if err := os.MkdirAll(dir, 0750); err != nil {
            ...
            return err
        }
    ...
    // Always a bind mount; "ro" is added for read-only volumes, and the result
    // is merged with m.mountOptions.
    options := []string{"bind"}
    if m.readOnly {
        options = append(options, "ro")
    }
    mountOptions := util.JoinMountOptions(options, m.mountOptions)
    ...
    err = m.mounter.MountSensitiveWithoutSystemd(globalPath, dir, "", mountOptions, nil)
    if err != nil {
        ...
        // The mount failed: re-check whether dir nevertheless became a mount point.
        notMnt, mntErr := mount.IsNotMountPoint(m.mounter, dir)
        ...
        if !notMnt {
            // Unmount the leftover mount; if that fails, log the unmount error
            // and return the original mount error.
            if mntErr = m.mounter.Unmount(dir); mntErr != nil {
                klog.Errorf("Failed to unmount: %v", mntErr)
                return err
            }
            ...
        }
        // Best-effort removal of dir; a failure is only logged as a warning.
        if rmErr := os.Remove(dir); rmErr != nil {
            klog.Warningf("failed to remove %s: %v", dir, rmErr)
        }
        // The caller always receives the original mount error.
        return err
    }
    ...
}

https://github.com/kubernetes/kubernetes/blob/v1.23.1/staging/src/k8s.io/mount-utils/mount_linux.go#L113

// MountSensitiveWithoutSystemd forwards all arguments to
// MountSensitiveWithoutSystemdWithMountFlags, passing nil for mountFlags.
func (mounter *Mounter) MountSensitiveWithoutSystemd(source string, target string, fstype string, options []string, sensitiveOptions []string) error {
	return mounter.MountSensitiveWithoutSystemdWithMountFlags(source, target, fstype, options, sensitiveOptions, nil /* mountFlags */)
}

bind mount需要执行两次mount:第一次完成bind,第二次remount是为了应用挂载选项,因为bind mount时可能会忽略一些选项。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/staging/src/k8s.io/mount-utils/mount_linux.go#L118

// MountSensitiveWithoutSystemdWithMountFlags (excerpt) runs doMount twice for
// a bind mount and once for any other mount.
func (mounter *Mounter) MountSensitiveWithoutSystemdWithMountFlags(source string, target string, fstype string, options []string, sensitiveOptions []string, mountFlags []string) error {
    ...
    // Split the options into the set for the initial bind and the set for the
    // follow-up remount.
    bind, bindOpts, bindRemountOpts, bindRemountOptsSensitive := MakeBindOptsSensitive(options, sensitiveOptions)
    if bind {
        // First call: bind mount with bindOpts.
        err := mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, bindOpts, bindRemountOptsSensitive, mountFlags, false)
        if err != nil {
            return err
        }
        // Second call: same source and target, now with bindRemountOpts. Both
        // calls pass bindRemountOptsSensitive as the sensitive options.
        return mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, bindRemountOpts, bindRemountOptsSensitive, mountFlags, false)
    }
    ...
    // Not a bind mount: a single doMount with the caller's options.
    return mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, options, sensitiveOptions, mountFlags, false)
}

Mounter.doMount调用MakeMountArgsSensitiveWithMountFlags函数组装命令,然后执行。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/staging/src/k8s.io/mount-utils/mount_linux.go#L144

// doMount (excerpt) builds the mount argument list, runs the command and
// returns the command's error.
func (mounter *Mounter) doMount(mounterPath string, mountCmd string, source string, target string, fstype string, options []string, sensitiveOptions []string, mountFlags []string, systemdMountRequired bool) error {
    mountArgs, mountArgsLogStr := MakeMountArgsSensitiveWithMountFlags(source, target, fstype, options, sensitiveOptions, mountFlags)
    // With a mounter path set, that binary becomes the executable and the
    // original mount command is passed to it as the first argument.
    if len(mounterPath) > 0 {
        mountArgs = append([]string{mountCmd}, mountArgs...)
        mountArgsLogStr = mountCmd + " " + mountArgsLogStr
        mountCmd = mounterPath
    }
    ...
    // Run the command, capturing stdout and stderr together.
    command := exec.Command(mountCmd, mountArgs...)
    output, err := command.CombinedOutput()
    ...
    return err
}

以挂载/dev/vdb为例

  1. 第一次将执行mount -o bind /var/lib/kubelet/plugins/kubernetes.io/local-volume/mounts/example-local-filesystem-pv /var/lib/kubelet/pods/53a1376e-80ed-4cbf-8c6f-dc52176b6938/volumes/kubernetes.io~local-volume/example-local-filesystem-pv
  2. 第二次将执行mount -o bind,remount /var/lib/kubelet/plugins/kubernetes.io/local-volume/mounts/example-local-filesystem-pv /var/lib/kubelet/pods/53a1376e-80ed-4cbf-8c6f-dc52176b6938/volumes/kubernetes.io~local-volume/example-local-filesystem-pv

这里我们举的例子没有额外的挂载选项,导致两次mount命令几乎相同,如果有额外的挂载选项,则将在第二次mount时,有所区别。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/staging/src/k8s.io/mount-utils/mount_linux.go#L243

// MakeMountArgsSensitiveWithMountFlags (excerpt) assembles the argument list
// for the mount command and returns it together with a log string. The code
// that builds mountArgsLogStr is elided here.
func MakeMountArgsSensitiveWithMountFlags(source, target, fstype string, options []string, sensitiveOptions []string, mountFlags []string) (mountArgs []string, mountArgsLogStr string) {
	// Build mount command as follows:
	//   mount [$mountFlags] [-t $fstype] [-o $options] [$source] $target
    mountArgs = []string{}
    ...
    // Mount flags come first.
    mountArgs = append(mountArgs, mountFlags...)
    ...
    // "-t <fstype>" is emitted only when a filesystem type is given.
    if len(fstype) > 0 {
        mountArgs = append(mountArgs, "-t", fstype)
        ...
    }
    // Options followed by sensitive options are joined with commas into a
    // single "-o" argument.
    if len(options) > 0 || len(sensitiveOptions) > 0 {
        combinedOptions := []string{}
        combinedOptions = append(combinedOptions, options...)
        combinedOptions = append(combinedOptions, sensitiveOptions...)
        mountArgs = append(mountArgs, "-o", strings.Join(combinedOptions, ","))
        ...
    }
    // The source is optional; the target is always the last argument.
    if len(source) > 0 {
        mountArgs = append(mountArgs, source)
        ...
    }
    mountArgs = append(mountArgs, target)
    ...
    return mountArgs, mountArgsLogStr
}

5. 设置文件属组

最后一步是设置卷内文件的属组(仅当卷不是只读、且是首次挂载时才执行)。SetVolumeOwnership会修改给定的卷,使其归fsGroup所有,并设置setgid位,使新创建的文件也归fsGroup所有。如果fsGroup为nil,则不做任何事情。

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L619

// SetUpAt (excerpt): final step, which applies fsGroup ownership to the volume.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
    ...
    // Ownership is changed only for volumes that are not read-only.
    if !m.readOnly {
        // Volume owner will be written only once on the first volume mount
        // (refs is computed in the elided code above; ownership is set only
        // when it is empty).
        if len(refs) == 0 {
            return volume.SetVolumeOwnership(m, mounterArgs.FsGroup, mounterArgs.FSGroupChangePolicy, util.FSGroupCompleteHook(m.plugin, nil))
        }
    }
    return nil
}

https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/volume_linux.go#L43

// SetVolumeOwnership (excerpt) walks the mounter's path and applies
// changeFilePermission to every entry. It returns nil without touching the
// volume when fsGroup is nil or when skipPermissionChange reports that the
// change can be skipped; otherwise it returns the error from the walk.
func SetVolumeOwnership(mounter Mounter, fsGroup *int64, fsGroupChangePolicy *v1.PodFSGroupChangePolicy, completeFunc func(types.CompleteFuncParam)) error {
    // No fsGroup requested: nothing to do.
    if fsGroup == nil {
        return nil
    }
    ...
    if skipPermissionChange(mounter, fsGroup, fsGroupChangePolicy) {
        klog.V(3).InfoS("Skipping permission and ownership change for volume", "path", mounter.GetPath())
        return nil
    }

    err := walkDeep(mounter.GetPath(), func(path string, info os.FileInfo, err error) error {
        // An error passed in by the walk is returned unchanged.
        if err != nil {
            return err
        }
        return changeFilePermission(path, fsGroup, mounter.GetAttributes().ReadOnly, info)
    })
    // The optional completion callback receives a pointer to the walk error.
    if completeFunc != nil {
        completeFunc(types.CompleteFuncParam{
            Err: &err,
        })
    }
    return err
}

os.Lchown() 方法用于更改文件所有者,类似 chown,但是不追踪链接。uid为-1表示不修改uid。

os.Chmod(filename, info.Mode()|mask) 表示在文件原有权限的基础上追加mask中的权限位:默认使用rwMask,只读卷使用roMask;对于目录,还会额外加上setgid位和execMask。

// changeFilePermission sets the group of filename to *fsGroup and, for
// anything that is not a symlink, ORs extra permission bits into its mode.
// Failures of Lchown and Chmod are only logged; the function always returns nil.
func changeFilePermission(filename string, fsGroup *int64, readonly bool, info os.FileInfo) error {
    // A uid of -1 leaves the owner unchanged; only the group is set. Lchown
    // acts on a symlink itself rather than on its target.
    err := os.Lchown(filename, -1, int(*fsGroup))
    if err != nil {
        klog.ErrorS(err, "Lchown failed", "path", filename)
    }

    // Symlinks get the group change only; their mode is left alone.
    if info.Mode()&os.ModeSymlink != 0 {
        return nil
    }

    // Pick the permission mask: rwMask by default, roMask for read-only volumes.
    mask := rwMask
    if readonly {
        mask = roMask
    }

    // Directories additionally get the setgid bit and execMask.
    if info.IsDir() {
        mask |= os.ModeSetgid
        mask |= execMask
    }

    // Bits are only added to the existing mode, never removed.
    err = os.Chmod(filename, info.Mode()|mask)
    if err != nil {
        // NOTE(review): the message says "Chown failed" although the failing
        // call is os.Chmod.
        klog.ErrorS(err, "Chown failed", "path", filename)
    }

    return nil
}

附:部署local卷yaml

# Pod that mounts the local-volume claim at /mnt inside its container.
apiVersion: v1
kind: Pod
metadata:
  name: test-local-filesystem-pod
spec:
  containers:
  - image: ubuntu
    name: local-filesystem-container
    tty: true
    volumeMounts:
    - mountPath: /mnt
      name: mypd
  volumes:
    - name: mypd
      persistentVolumeClaim:
        # Must match the name of the PersistentVolumeClaim defined below.
        claimName: example-local-system-filesystem-claim

---
# Local PersistentVolume backed by /dev/vdb and pinned to a single node.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: example-local-filesystem-pv
spec:
  capacity:
    storage: 1Gi
  volumeMode: Filesystem
  accessModes:
  - ReadWriteOnce
  persistentVolumeReclaimPolicy: Delete
  storageClassName: local-system-storage
  local:
    # Path on the node that backs this volume.
    path: /dev/vdb
  # Restricts the volume to the node whose hostname label matches.
  nodeAffinity:
    required:
      nodeSelectorTerms:
      - matchExpressions:
        - key: kubernetes.io/hostname
          operator: In
          values:
          - wanglei-k8s-containerd

---
# StorageClass referenced by the PV and PVC in this file: no dynamic
# provisioner, with volumeBindingMode set to WaitForFirstConsumer.
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: local-system-storage
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer

---
# Claim for 1Gi of Filesystem-mode storage from the local-system-storage class;
# referenced by the Pod above.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-local-system-filesystem-claim
spec:
  volumeMode: Filesystem
  storageClassName: local-system-storage
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi