lxd容器之GPU发现和加载

lxd gpu设备发现:

// nvidiaGpuCards describes a per-card NVIDIA character device, i.e. a node
// matching /dev/nvidia[0-9]+.
type nvidiaGpuCards struct {
	path  string // device node path, e.g. /dev/nvidia0
	major int    // device major number
	minor int    // device minor number
	id    string // decimal string form of minor
}

// nvidiaGpuDevices describes an NVIDIA device node that is not tied to a
// single card, i.e. one of {/dev/nvidiactl, /dev/nvidia-uvm, ...}.
type nvidiaGpuDevices struct {
	path  string // device node path under /dev
	major int    // device major number
	minor int    // device minor number
}

// gpuDevice describes one DRM device node, e.g. /dev/dri/card0. If we detect
// that vendor == nvidia, then nvidia will contain the corresponding nvidia
// card, e.g. {/dev/dri/card1 --> /dev/nvidia1}.
type gpuDevice struct {
	vendorid  string // PCI vendor ID as hex digits without the "0x" prefix
	productid string // PCI device ID as hex digits without the "0x" prefix
	id        string // card id e.g. 0
	// If related devices have the same PCI address as the GPU we should
	// mount them all. Meaning if we detect /dev/dri/card0,
	// /dev/dri/controlD64, and /dev/dri/renderD128 with the same PCI
	// address, then they should all be made available in the container.
	pci    string
	nvidia nvidiaGpuCards // matching /dev/nvidiaN node; path is empty when there is none

	path  string // DRM device node path under /dev/dri
	major int    // device major number of the DRM node
	minor int    // device minor number of the DRM node
}

// isNvidiaGpu reports whether the device's PCI vendor ID is NVIDIA's (10de),
// compared case-insensitively.
func (g *gpuDevice) isNvidiaGpu() bool {
	const nvidiaVendorID = "10de"

	isNvidia := strings.EqualFold(nvidiaVendorID, g.vendorid)
	return isNvidia
}

// cardIds pairs a DRM card id with the PCI address of the device it belongs
// to, so that the id can be propagated to other nodes on the same PCI address.
type cardIds struct {
	id  string // card id (decimal string)
	pci string // PCI address of the card
}

// deviceLoadGpu discovers the GPUs present on the host.
//
// It walks /sys/bus/pci/devices and treats every PCI device that has a "drm"
// subdirectory as a GPU. One gpuDevice is returned per DRM node (card*,
// controlD*, renderD*, ...) found below that directory. For NVIDIA cards the
// matching /dev/nvidiaN node is attached to the entry, and when at least one
// NVIDIA card was seen the remaining non-numbered /dev/nvidia* nodes are
// returned as the second result.
//
// A missing /sys/bus/pci/devices directory is not an error: all results are
// nil in that case. PCI devices or DRM nodes whose sysfs attributes cannot be
// read or parsed are skipped.
func deviceLoadGpu() ([]gpuDevice, []nvidiaGpuDevices, error) {
	const pciDevicesPath = "/sys/bus/pci/devices"

	var gpus []gpuDevice
	var nvidiaDevices []nvidiaGpuDevices
	var cards []cardIds

	// Compile the name patterns once instead of once per directory entry.
	cardName, err := regexp.Compile("^card[0-9]+")
	if err != nil {
		return nil, nil, err
	}
	validNvidia, err := regexp.Compile(`^nvidia[^0-9]+`)
	if err != nil {
		return nil, nil, err
	}

	ents, err := ioutil.ReadDir(pciDevicesPath)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil, nil
		}
		return nil, nil, err
	}

	isNvidia := false
	for _, ent := range ents {
		// The pci address == the name of the directory. So let's use
		// this cheap way of retrieving it.
		pciAddr := ent.Name()

		// Make sure that we are dealing with a GPU by looking whether
		// the "drm" subfolder exists. Any failure to list it (missing
		// or unreadable) means there is nothing to collect here.
		drm := filepath.Join(pciDevicesPath, pciAddr, "drm")
		drmEnts, err := ioutil.ReadDir(drm)
		if err != nil {
			continue
		}

		// Retrieve vendor ID. Skip the device rather than continuing
		// with empty data if it cannot be read.
		vendorRaw, err := ioutil.ReadFile(filepath.Join(pciDevicesPath, pciAddr, "vendor"))
		if err != nil {
			continue
		}

		// Retrieve device ID.
		productRaw, err := ioutil.ReadFile(filepath.Join(pciDevicesPath, pciAddr, "device"))
		if err != nil {
			continue
		}

		// sysfs reports the IDs as e.g. "0x10de\n"; normalise once per
		// PCI device to bare hex digits.
		vendorID := strings.TrimPrefix(strings.TrimSpace(string(vendorRaw)), "0x")
		productID := strings.TrimPrefix(strings.TrimSpace(string(productRaw)), "0x")

		// Store all associated subdevices, e.g. controlD64, renderD128.
		// The name of the directory == the last part of the
		// /dev/dri/controlD64 path. So drmEnt.Name() will give us
		// controlD64.
		for _, drmEnt := range drmEnts {
			name := drmEnt.Name()

			// The "dev" attribute holds "major:minor".
			majMinByte, err := ioutil.ReadFile(filepath.Join(drm, name, "dev"))
			if err != nil {
				continue
			}
			majMinSlice := strings.Split(strings.TrimSpace(string(majMinByte)), ":")
			if len(majMinSlice) != 2 {
				continue
			}
			majorInt, err := strconv.Atoi(majMinSlice[0])
			if err != nil {
				continue
			}
			minorInt, err := strconv.Atoi(majMinSlice[1])
			if err != nil {
				continue
			}

			tmpGpu := gpuDevice{
				pci:       pciAddr,
				vendorid:  vendorID,
				productid: productID,
				path:      filepath.Join("/dev/dri", name),
				major:     majorInt,
				minor:     minorInt,
			}

			isCard := cardName.MatchString(name)
			if isCard {
				// If it is a card its minor number will be its id.
				tmpGpu.id = strconv.Itoa(minorInt)
				cards = append(cards, cardIds{id: tmpGpu.id, pci: tmpGpu.pci})
			}

			// Find matching /dev/nvidia* entry for /dev/dri/card*
			if isCard && tmpGpu.isNvidiaGpu() {
				isNvidia = true

				nvidiaPath := "/dev/nvidia" + strconv.Itoa(tmpGpu.minor)
				stat := syscall.Stat_t{}
				if err := syscall.Stat(nvidiaPath, &stat); err != nil {
					// Without the nvidia node the card is not
					// reported at all.
					continue
				}
				// NOTE(review): this split assumes an 8-bit minor
				// number in Rdev.
				tmpGpu.nvidia.path = nvidiaPath
				tmpGpu.nvidia.major = int(stat.Rdev / 256)
				tmpGpu.nvidia.minor = int(stat.Rdev % 256)
				tmpGpu.nvidia.id = strconv.Itoa(tmpGpu.nvidia.minor)
			}

			gpus = append(gpus, tmpGpu)
		}
	}

	// We detected a Nvidia card, so let's collect all other nvidia devices
	// that are not /dev/nvidia[0-9]+.
	if isNvidia {
		nvidiaEnts, err := ioutil.ReadDir("/dev")
		if err != nil {
			return nil, nil, err
		}

		for _, nvidiaEnt := range nvidiaEnts {
			if !validNvidia.MatchString(nvidiaEnt.Name()) {
				continue
			}

			nvidiaPath := filepath.Join("/dev", nvidiaEnt.Name())
			stat := syscall.Stat_t{}
			if err := syscall.Stat(nvidiaPath, &stat); err != nil {
				continue
			}

			nvidiaDevices = append(nvidiaDevices, nvidiaGpuDevices{
				path:  nvidiaPath,
				major: int(stat.Rdev / 256),
				minor: int(stat.Rdev % 256),
			})
		}
	}

	// Since we'll give users the ability to specify an id we need to group
	// devices on the same PCI that belong to the same card by id. When
	// several cards share a PCI address the last one seen wins.
	idByPci := make(map[string]string, len(cards))
	for _, card := range cards {
		idByPci[card.pci] = card.id
	}
	for i := range gpus {
		if id, ok := idByPci[gpus[i].pci]; ok {
			gpus[i].id = id
		}
	}

	return gpus, nvidiaDevices, nil
}

lxd gpu设备加载:由下可见

设备最终是否被加载,取决于通过 REST 接口创建时 request body 中 config.devices 下对应设备的 type 是否为 gpu,以及所指定的属性(vendorid、pci、productid、id)是否与发现到的 GPU 信息一致。那么客户端又是如何知道 vendorid、pci 等信息的呢?实际中一般需要建立 GPU 资源池,GPU 元数据由上层管理,并通过一定的调度规则来指定。GPU 资源的发现可以通过类似上面的函数完成,也可以通过 lspci 命令完成。

else if m["type"] == "gpu" {
	// Discover the host GPUs lazily, the first time a gpu device entry is
	// encountered.
	if gpus == nil {
		gpus, nvidiaDevices, err = deviceLoadGpu()
		if err != nil {
			return "", err
		}
	}

	sawNvidia := false
	for _, gpu := range gpus {
		// Whether a discovered GPU is attached depends on the device
		// entry from the REST request body: its type must be "gpu" and
		// every property it specifies (vendorid, pci, productid, id)
		// must match the discovered value. Empty properties match
		// anything.
		if (m["vendorid"] != "" && gpu.vendorid != m["vendorid"]) ||
			(m["pci"] != "" && gpu.pci != m["pci"]) ||
			(m["productid"] != "" && gpu.productid != m["productid"]) ||
			(m["id"] != "" && gpu.id != m["id"]) {
			continue
		}

		err := c.setupUnixDevice(k, m, gpu.major, gpu.minor, gpu.path, true)
		if err != nil {
			return "", err
		}

		// Only NVIDIA cards carry an additional /dev/nvidiaN node.
		if gpu.nvidia.path == "" {
			continue
		}

		err = c.setupUnixDevice(k, m, gpu.nvidia.major, gpu.nvidia.minor, gpu.nvidia.path, true)
		if err != nil {
			return "", err
		}

		sawNvidia = true
	}

	// Once any NVIDIA card was attached, also pass through the shared
	// NVIDIA device nodes collected by deviceLoadGpu.
	if sawNvidia {
		for _, gpu := range nvidiaDevices {
			err := c.setupUnixDevice(k, m, gpu.major, gpu.minor, gpu.path, true)
			if err != nil {
				return "", err
			}
		}
	}
}

  

// setupUnixDevice() creates the unix device and sets up the necessary low-level
// liblxc configuration items.
func (c *containerLXC) setupUnixDevice(devType string, dev types.Device, major int, minor int, path string, createMustSucceed bool) error {
	if c.IsPrivileged() && !runningInUserns && cgDevicesController {
         //设置设备访问白名单 err := lxcSetConfigItem(c.c, "lxc.cgroup.devices.allow", fmt.Sprintf("c %d:%d rwm", major, minor)) if err != nil { return err } } temp := types.Device{} if err := shared.DeepCopy(&dev, &temp); err != nil { return err } temp["major"] = fmt.Sprintf("%d", major) temp["minor"] = fmt.Sprintf("%d", minor) temp["path"] = path paths, err := c.createUnixDevice(temp) if err != nil { shared.LogDebug("failed to create device", log.Ctx{"err": err, "device": devType}) if createMustSucceed { return err } return nil } devPath := paths[0] tgtPath := paths[1]      //设置挂载对象 err = lxcSetConfigItem(c.c, "lxc.mount.entry", fmt.Sprintf("%s %s none bind,create=file", devPath, tgtPath)) if err != nil { return err } return nil }

  

// Unix devices handling
func (c *containerLXC) createUnixDevice(m types.Device) ([]string, error) {
	var err error
	var major, minor int

	// Our device paths
	srcPath := m["path"]
	tgtPath := strings.TrimPrefix(srcPath, "/")
	devName := fmt.Sprintf("unix.%s", strings.Replace(tgtPath, "/", "-", -1))
	devPath := filepath.Join(c.DevicesPath(), devName)//var/lib/lxd/devices/容器名称/xxxx

	// Extra checks for nesting
	if runningInUserns {
		for key, value := range m {
			if shared.StringInSlice(key, []string{"major", "minor", "mode", "uid", "gid"}) && value != "" {
				return nil, fmt.Errorf("The "%s" property may not be set when adding a device to a nested container", key)
			}
		}
	}

	// Get the major/minor of the device we want to create
	if m["major"] == "" && m["minor"] == "" {
		// If no major and minor are set, use those from the device on the host
		_, major, minor, err = deviceGetAttributes(srcPath)
		if err != nil {
			return nil, fmt.Errorf("Failed to get device attributes for %s: %s", m["path"], err)
		}
	} else if m["major"] == "" || m["minor"] == "" {
		return nil, fmt.Errorf("Both major and minor must be supplied for device: %s", m["path"])
	} else {
		major, err = strconv.Atoi(m["major"])
		if err != nil {
			return nil, fmt.Errorf("Bad major %s in device %s", m["major"], m["path"])
		}

		minor, err = strconv.Atoi(m["minor"])
		if err != nil {
			return nil, fmt.Errorf("Bad minor %s in device %s", m["minor"], m["path"])
		}
	}

	// Get the device mode
	mode := os.FileMode(0660)
	if m["mode"] != "" {
		tmp, err := deviceModeOct(m["mode"])
		if err != nil {
			return nil, fmt.Errorf("Bad mode %s in device %s", m["mode"], m["path"])
		}
		mode = os.FileMode(tmp)
	}

	if m["type"] == "unix-block" {
		mode |= syscall.S_IFBLK
	} else {
		mode |= syscall.S_IFCHR
	}

	// Get the device owner
	uid := 0
	gid := 0

	if m["uid"] != "" {
		uid, err = strconv.Atoi(m["uid"])
		if err != nil {
			return nil, fmt.Errorf("Invalid uid %s in device %s", m["uid"], m["path"])
		}
	}

	if m["gid"] != "" {
		gid, err = strconv.Atoi(m["gid"])
		if err != nil {
			return nil, fmt.Errorf("Invalid gid %s in device %s", m["gid"], m["path"])
		}
	}

	// Create the devices directory if missing
	if !shared.PathExists(c.DevicesPath()) {
		os.Mkdir(c.DevicesPath(), 0711)
		if err != nil {
			return nil, fmt.Errorf("Failed to create devices path: %s", err)
		}
	}

	// Clean any existing entry
	if shared.PathExists(devPath) {
		if runningInUserns {
			syscall.Unmount(devPath, syscall.MNT_DETACH)
		}

		err = os.Remove(devPath)
		if err != nil {
			return nil, fmt.Errorf("Failed to remove existing entry: %s", err)
		}
	}

	// Create the new entry
	if !runningInUserns {
		if err := syscall.Mknod(devPath, uint32(mode), minor|(major<<8)); err != nil {
			return nil, fmt.Errorf("Failed to create device %s for %s: %s", devPath, m["path"], err)
		}

		if err := os.Chown(devPath, uid, gid); err != nil {
			return nil, fmt.Errorf("Failed to chown device %s: %s", devPath, err)
		}

		// Needed as mknod respects the umask
		if err := os.Chmod(devPath, mode); err != nil {
			return nil, fmt.Errorf("Failed to chmod device %s: %s", devPath, err)
		}

		if c.idmapset != nil {
			if err := c.idmapset.ShiftFile(devPath); err != nil {
				// uidshift failing is weird, but not a big problem.  Log and proceed
				shared.LogDebugf("Failed to uidshift device %s: %s
", m["path"], err)
			}
		}
	} else {
		f, err := os.Create(devPath)
		if err != nil {
			return nil, err
		}
		f.Close()

		err = deviceMountDisk(srcPath, devPath, false, false)
		if err != nil {
			return nil, err
		}
	}

	return []string{devPath, tgtPath}, nil
}

  

// deviceMountDisk mounts srcPath at dstPath.
//
// When srcPath is a block device its filesystem type is detected and it is
// mounted as such; otherwise srcPath is bind-mounted (recursively when
// recursive is set). readonly adds MS_RDONLY in both cases. After a successful
// mount, the propagation of dstPath is changed with MS_REC|MS_SLAVE.
func deviceMountDisk(srcPath string, dstPath string, readonly bool, recursive bool) error {
	// Assemble the flags for the initial mount.
	mountFlags := 0
	if readonly {
		mountFlags |= syscall.MS_RDONLY
	}

	// Pick the filesystem type: detected for block devices, "none" for
	// bind mounts.
	fsType := "none"
	if !deviceIsBlockdev(srcPath) {
		mountFlags |= syscall.MS_BIND
		if recursive {
			mountFlags |= syscall.MS_REC
		}
	} else {
		detected, err := shared.BlockFsDetect(srcPath)
		if err != nil {
			return err
		}
		fsType = detected
	}

	// Perform the mount itself.
	err := syscall.Mount(srcPath, dstPath, fsType, uintptr(mountFlags), "")
	if err != nil {
		return fmt.Errorf("Unable to mount %s at %s: %s", srcPath, dstPath, err)
	}

	// Adjust propagation of the new mount. NOTE(review): the flags request
	// a slave mount although the error message says "private".
	propagation := uintptr(syscall.MS_REC | syscall.MS_SLAVE)
	err = syscall.Mount("", dstPath, "", propagation, "")
	if err != nil {
		return fmt.Errorf("unable to make mount %s private: %s", dstPath, err)
	}

	return nil
}

  

原文地址:https://www.cnblogs.com/hrbeu05/p/6502150.html