k8s volume plugin 源码分析: Local卷 SetUp 实现
tags: k8s,kubernetes,container,源码分析
k8s volume plugin 源码分析: Local卷 SetUp 实现
1. 调用点
在VolumeManager中挂载卷时,执行SetUp
方法,将卷挂载至pod内。
// GenerateMountVolumeFunc is shown here only as an excerpt: the part of the
// mount closure that invokes the volume mounter's SetUp. Everything else is elided.
func (og *operationGenerator) GenerateMountVolumeFunc(...) volumetypes.GeneratedOperations {
...
mountVolumeFunc := func() volumetypes.OperationContext {
...
// Execute mount
// SetUp is given the pod-derived FsUser, the fsGroup, the volume's desired
// size limit and the fsGroup change policy, bundled as volume.MounterArgs.
mountErr := volumeMounter.SetUp(volume.MounterArgs{
FsUser: util.FsUserFrom(volumeToMount.Pod),
FsGroup: fsGroup,
DesiredSize: volumeToMount.DesiredSizeLimit,
FSGroupChangePolicy: fsGroupChangePolicy,
})
...
// Later in the closure (intermediate steps elided) the volume is reported
// through actualStateOfWorld.MarkVolumeAsMounted.
markVolMountedErr := actualStateOfWorld.MarkVolumeAsMounted(markOpts)
...
}
...
}
不是所有类型的卷,都实现了SetUp方法。不同类型的卷的实现也有所不同,这里以local卷的实现为例进行分析。
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L521
// SetUp delegates to SetUpAt, using the mounter's own GetPath() result as the
// target directory and passing mounterArgs through unchanged.
func (m *localVolumeMounter) SetUp(mounterArgs volume.MounterArgs) error {
return m.SetUpAt(m.GetPath(), mounterArgs)
}
// SetUpAt performs the setup for the directory dir. The body is elided in this
// excerpt; only the signature is shown.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
...
}
其中,m.GetPath
得到路径一般为
/var/lib/kubelet/pods/{POD_UID}/volumes/kubernetes.io~local-volume/{VOLUME_NAME}
, 这就是最终要把卷挂载到的目的路径。
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L493
// GetPath returns the pod volume directory that the plugin host reports for
// this pod UID, the escaped local-volume plugin name, and the volume name.
func (l *localVolume) GetPath() string {
// The plugin name is passed through EscapeQualifiedName before being used as a directory name.
return l.plugin.host.GetPodVolumeDir(l.podUID, utilstrings.EscapeQualifiedName(localVolumePluginName), l.volName)
}
2. 校验路径穿越
要求globalPath
不能目录穿越。
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L534
// SetUpAt (excerpt): validation of the mounter's globalPath.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
...
// Reject a globalPath that contains a ".." path component.
err := validation.ValidatePathNoBacksteps(m.globalPath)
if err != nil {
// The returned error carries both the offending path and the validation error.
return fmt.Errorf("invalid path: %s %v", m.globalPath, err)
}
...
}
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/validation/pv_validation.go#L62
// ValidatePathNoBacksteps returns an error if any "/"-separated component of
// targetPath is exactly ".."; otherwise it returns nil.
func ValidatePathNoBacksteps(targetPath string) error {
// ToSlash converts OS-specific separators to "/" so the split sees every component.
parts := strings.Split(filepath.ToSlash(targetPath), "/")
for _, item := range parts {
// Exact comparison: a component that merely contains ".." (e.g. "..foo") is not rejected.
if item == ".." {
return errors.New("must not contain '..'")
}
}
return nil
}
其中,globalPath
来源于PV Spec中的local.path配置(而不是Pod Spec):目录类型直接使用该路径,块设备类型则由插件的块设备全局路径与卷名拼接而成。该值由外部配置决定,存在风险,历史上也确实在此处出过问题, 该问题的修复引入了此校验。
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L120
// NewMounter (excerpt): resolves the global local path for the volume spec via
// getGlobalLocalPath. The rest of the mounter construction is elided.
func (plugin *localVolumePlugin) NewMounter(spec *volume.Spec, pod *v1.Pod, _ volume.VolumeOptions) (volume.Mounter, error) {
...
globalLocalPath, err := plugin.getGlobalLocalPath(spec)
...
}
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L268
// getGlobalLocalPath returns the global path for a local PV, chosen by the file
// type of spec.PersistentVolume.Spec.Local.Path:
//   - directory: the configured path itself;
//   - block device: spec.Name() joined under the plugin's block-device base global path;
//   - anything else: an error.
// It also fails if the plugin host is not a volume.KubeletVolumeHost or if the
// file type cannot be determined.
func (plugin *localVolumePlugin) getGlobalLocalPath(spec *volume.Spec) (string, error) {
...
// GetHostUtil is only reachable through the KubeletVolumeHost interface.
kvh, ok := plugin.host.(volume.KubeletVolumeHost)
if !ok {
return "", fmt.Errorf("plugin volume host does not implement KubeletVolumeHost interface")
}
fileType, err := kvh.GetHostUtil().GetFileType(spec.PersistentVolume.Spec.Local.Path)
if err != nil {
return "", err
}
switch fileType {
case hostutil.FileTypeDirectory:
return spec.PersistentVolume.Spec.Local.Path, nil
case hostutil.FileTypeBlockDev:
return filepath.Join(plugin.generateBlockDeviceBaseGlobalPath(), spec.Name()), nil
default:
return "", fmt.Errorf("only directory and block device are supported")
}
}
3. 避免重复挂载
// SetUpAt (excerpt): skip the mount when dir is already a mount point.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
...
notMnt, err := mount.IsNotMountPoint(m.mounter, dir)
...
// A "does not exist" error is tolerated; any other error is logged and aborts the setup.
if err != nil && !os.IsNotExist(err) {
klog.Errorf("cannot validate mount point: %s %v", dir, err)
return err
}
// notMnt == false: return success without mounting again.
if !notMnt {
return nil
}
...
}
4. bind mount
使用bind mount, 是为了允许一个卷被挂载至多个pod。
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L587
// SetUpAt (excerpt): create the target directory, bind-mount globalPath onto
// it, and clean up if the mount fails.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
...
// Create dir (and any missing parents) with mode 0750.
if err := os.MkdirAll(dir, 0750); err != nil {
...
return err
}
...
// The option list always starts with "bind"; "ro" is added for read-only
// volumes, and the result is combined with the mounter's own mount options.
options := []string{"bind"}
if m.readOnly {
options = append(options, "ro")
}
mountOptions := util.JoinMountOptions(options, m.mountOptions)
...
// Mount globalPath onto dir with an empty fstype and no sensitive options.
err = m.mounter.MountSensitiveWithoutSystemd(globalPath, dir, "", mountOptions, nil)
if err != nil {
...
// The mount failed: check whether dir is a mount point anyway and, if so, unmount it.
notMnt, mntErr := mount.IsNotMountPoint(m.mounter, dir)
...
if !notMnt {
if mntErr = m.mounter.Unmount(dir); mntErr != nil {
// The unmount failure is only logged; the original mount error is what gets returned.
klog.Errorf("Failed to unmount: %v", mntErr)
return err
}
...
}
// Removing dir is best-effort: a failure is logged as a warning and does not change the result.
if rmErr := os.Remove(dir); rmErr != nil {
klog.Warningf("failed to remove %s: %v", dir, rmErr)
}
// Return the original mount error.
return err
}
...
}
// MountSensitiveWithoutSystemd is a thin wrapper around
// MountSensitiveWithoutSystemdWithMountFlags that forwards all arguments and
// passes nil for mountFlags.
func (mounter *Mounter) MountSensitiveWithoutSystemd(source string, target string, fstype string, options []string, sensitiveOptions []string) error {
return mounter.MountSensitiveWithoutSystemdWithMountFlags(source, target, fstype, options, sensitiveOptions, nil /* mountFlags */)
}
bind mount需要执行两次mount:第一次完成bind,第二次以remount方式应用其余挂载选项,因为第一次bind mount可能会忽略一些选项。
// MountSensitiveWithoutSystemdWithMountFlags (excerpt) issues the mount via
// doMount. A bind mount results in two doMount calls; any other mount results
// in one call with the caller's options unchanged.
func (mounter *Mounter) MountSensitiveWithoutSystemdWithMountFlags(source string, target string, fstype string, options []string, sensitiveOptions []string, mountFlags []string) error {
...
// MakeBindOptsSensitive yields the bind flag plus separate option sets for the two bind-mount calls.
bind, bindOpts, bindRemountOpts, bindRemountOptsSensitive := MakeBindOptsSensitive(options, sensitiveOptions)
if bind {
// First call: bindOpts. Note that both calls pass bindRemountOptsSensitive as the sensitive options.
err := mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, bindOpts, bindRemountOptsSensitive, mountFlags, false)
if err != nil {
return err
}
// Second call: same source and target, this time with bindRemountOpts.
return mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, bindRemountOpts, bindRemountOptsSensitive, mountFlags, false)
}
...
// Non-bind mount: a single call with the original options and sensitiveOptions.
return mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, options, sensitiveOptions, mountFlags, false)
}
Mounter.doMount
调用MakeMountArgsSensitiveWithMountFlags
函数组装命令,然后执行。
// doMount (excerpt) builds the mount argument list with
// MakeMountArgsSensitiveWithMountFlags and executes the resulting command.
func (mounter *Mounter) doMount(mounterPath string, mountCmd string, source string, target string, fstype string, options []string, sensitiveOptions []string, mountFlags []string, systemdMountRequired bool) error {
mountArgs, mountArgsLogStr := MakeMountArgsSensitiveWithMountFlags(source, target, fstype, options, sensitiveOptions, mountFlags)
// When mounterPath is set, it becomes the executable and the original
// mountCmd is prepended to the arguments (and to the loggable string).
if len(mounterPath) > 0 {
mountArgs = append([]string{mountCmd}, mountArgs...)
mountArgsLogStr = mountCmd + " " + mountArgsLogStr
mountCmd = mounterPath
}
...
command := exec.Command(mountCmd, mountArgs...)
// CombinedOutput runs the command and captures stdout and stderr together.
output, err := command.CombinedOutput()
...
return err
}
以挂载/dev/vdb
为例
- 第一次将执行
mount -o bind /var/lib/kubelet/plugins/kubernetes.io/local-volume/mounts/example-local-filesystem-pv /var/lib/kubelet/pods/53a1376e-80ed-4cbf-8c6f-dc52176b6938/volumes/kubernetes.io~local-volume/example-local-filesystem-pv
- 第二次将执行
mount -o bind,remount /var/lib/kubelet/plugins/kubernetes.io/local-volume/mounts/example-local-filesystem-pv /var/lib/kubelet/pods/53a1376e-80ed-4cbf-8c6f-dc52176b6938/volumes/kubernetes.io~local-volume/example-local-filesystem-pv
这里我们举的例子没有额外的挂载选项,导致两次mount命令几乎相同,如果有额外的挂载选项,则将在第二次mount时,有所区别。
// MakeMountArgsSensitiveWithMountFlags builds the argument list for the mount
// command in this order: mountFlags, "-t fstype" (if fstype is non-empty),
// "-o options" (if any regular or sensitive options exist), source (if
// non-empty), and finally target. It also returns mountArgsLogStr, whose
// construction is elided in this excerpt.
func MakeMountArgsSensitiveWithMountFlags(source, target, fstype string, options []string, sensitiveOptions []string, mountFlags []string) (mountArgs []string, mountArgsLogStr string) {
// Build mount command as follows:
// mount [$mountFlags] [-t $fstype] [-o $options] [$source] $target
mountArgs = []string{}
...
mountArgs = append(mountArgs, mountFlags...)
...
if len(fstype) > 0 {
mountArgs = append(mountArgs, "-t", fstype)
...
}
if len(options) > 0 || len(sensitiveOptions) > 0 {
// Regular options first, then sensitive ones, joined into a single comma-separated -o value.
combinedOptions := []string{}
combinedOptions = append(combinedOptions, options...)
combinedOptions = append(combinedOptions, sensitiveOptions...)
mountArgs = append(mountArgs, "-o", strings.Join(combinedOptions, ","))
...
}
if len(source) > 0 {
mountArgs = append(mountArgs, source)
...
}
// target is appended unconditionally and is always the last argument.
mountArgs = append(mountArgs, target)
...
return mountArgs, mountArgsLogStr
}
5. 设置文件组用户
最后一步是设置文件的组属主。SetVolumeOwnership
修改给定的卷,使其为fsGroup所有,并设置SetGid,使新创建的文件为fsGroup所有。如果fsGroup为nil,则不做任何事情。
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/local/local.go#L619
// SetUpAt (excerpt): final step, applying fsGroup ownership to the volume.
func (m *localVolumeMounter) SetUpAt(dir string, mounterArgs volume.MounterArgs) error {
...
// Ownership is never changed for read-only volumes.
if !m.readOnly {
// Volume owner will be written only once on the first volume mount
// NOTE(review): refs is computed in elided code; presumably the existing
// mount references of this volume. Confirm against upstream.
if len(refs) == 0 {
return volume.SetVolumeOwnership(m, mounterArgs.FsGroup, mounterArgs.FSGroupChangePolicy, util.FSGroupCompleteHook(m.plugin, nil))
}
}
// Read-only volume, or refs is non-empty: succeed without changing ownership.
return nil
}
https://github.com/kubernetes/kubernetes/blob/v1.23.1/pkg/volume/volume_linux.go#L43
// SetVolumeOwnership (excerpt) walks the tree rooted at mounter.GetPath() and
// applies changeFilePermission to every entry. It does nothing and returns nil
// when fsGroup is nil or when skipPermissionChange reports that the change can
// be skipped. In the code shown, completeFunc (if non-nil) is called after the
// walk with a pointer to the walk's error; it is not called on the early returns.
func SetVolumeOwnership(mounter Mounter, fsGroup *int64, fsGroupChangePolicy *v1.PodFSGroupChangePolicy, completeFunc func(types.CompleteFuncParam)) error {
// No fsGroup requested: nothing to do.
if fsGroup == nil {
return nil
}
...
if skipPermissionChange(mounter, fsGroup, fsGroupChangePolicy) {
klog.V(3).InfoS("Skipping permission and ownership change for volume", "path", mounter.GetPath())
return nil
}
err := walkDeep(mounter.GetPath(), func(path string, info os.FileInfo, err error) error {
// An error handed to the callback by the walk is returned as-is.
if err != nil {
return err
}
// The read-only flag comes from the mounter's attributes, not from a parameter.
return changeFilePermission(path, fsGroup, mounter.GetAttributes().ReadOnly, info)
})
if completeFunc != nil {
completeFunc(types.CompleteFuncParam{
Err: &err,
})
}
return err
}
os.Lchown()
方法用于更改文件所有者,类似 chown,但是不追踪链接。uid为-1表示不修改uid。
os.Chmod(filename, info.Mode()|mask)
表示在文件原有权限位的基础上追加mask中的权限位(不会去掉任何已有权限):只读卷使用roMask,否则使用rwMask;对于目录还会额外加上setgid位和execMask。
// changeFilePermission sets the group of filename to *fsGroup and, for
// anything that is not a symlink, ORs a permission mask into its existing mode.
// Lchown and Chmod failures are logged but not returned: the function always
// returns nil.
func changeFilePermission(filename string, fsGroup *int64, readonly bool, info os.FileInfo) error {
// uid -1 leaves the owner unchanged; Lchown acts on a symlink itself rather than its target.
err := os.Lchown(filename, -1, int(*fsGroup))
if err != nil {
klog.ErrorS(err, "Lchown failed", "path", filename)
}
// Symlinks get the group change only; their mode is not touched.
if info.Mode()&os.ModeSymlink != 0 {
return nil
}
// Pick the base mask by volume mode: roMask for read-only, rwMask otherwise.
mask := rwMask
if readonly {
mask = roMask
}
// Directories additionally get the setgid bit and execMask.
if info.IsDir() {
mask |= os.ModeSetgid
mask |= execMask
}
// The mask is ORed into the current mode, so existing permission bits are never cleared.
err = os.Chmod(filename, info.Mode()|mask)
if err != nil {
// NOTE(review): the log text says "Chown" although the failing call is os.Chmod.
klog.ErrorS(err, "Chown failed", "path", filename)
}
return nil
}
附:部署local卷yaml
apiVersion: v1
kind: Pod
metadata:
name: test-local-filesystem-pod
spec:
containers:
- image: ubuntu
name: local-filesystem-container
tty: true
volumeMounts:
- mountPath: /mnt
name: mypd
volumes:
- name: mypd
persistentVolumeClaim:
claimName: example-local-system-filesystem-claim
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: example-local-filesystem-pv
spec:
capacity:
storage: 1Gi
volumeMode: Filesystem
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Delete
storageClassName: local-system-storage
local:
path: /dev/vdb
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- wanglei-k8s-containerd
---
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: local-system-storage
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: example-local-system-filesystem-claim
spec:
volumeMode: Filesystem
storageClassName: local-system-storage
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi