Source code for itwinai.torch.monitoring.backend

# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Linus Eickhoff
#
# Credit:
# - Linus Eickhoff <linus.maximilian.eickhoff@cern.ch> - CERN
# --------------------------------------------------------------------------------------

import logging
import os
from abc import ABC, abstractmethod
from types import ModuleType
from typing import Literal

py_logger = logging.getLogger(__name__)


[docs] class GPUBackend(ABC): @property @abstractmethod def man_lib(self) -> ModuleType | None: """The library used for GPU management.""" pass @property @abstractmethod def man_type(self) -> Literal["nvidia", "amd"] | None: """The type of GPU management library used.""" pass
[docs] @abstractmethod def get_handle_by_uuid(self, gpu_uuid: str) -> object: """Get the device handle for a specific GPU UUID.""" pass
[docs] @abstractmethod def get_handle_by_id(self, gpu_id: int) -> object: """Get the device handle for a specific GPU index (ID).""" pass
[docs] @abstractmethod def get_gpu_utilization(self, handle) -> float: """Get the GPU utilization (%) for a given handle.""" pass
[docs] @abstractmethod def get_gpu_power_usage(self, handle) -> float: """Get the GPU power usage (W) for a given handle.""" pass
[docs] @abstractmethod def get_visible_gpu_ids(self) -> list[int]: """Get a list of visible GPU UUIDs.""" pass
class NvidiaBackend(GPUBackend):
    """GPU monitoring backend that talks to NVIDIA devices through ``pynvml``."""

    def __init__(self):
        """Import ``pynvml`` and initialise NVML.

        Raises:
            RuntimeError: If ``pynvml`` cannot be imported or initialised.
                The underlying error is attached as ``__cause__``.
        """
        try:
            import pynvml as nv

            nv.nvmlInit()  # will raise if no NVIDIA driver
        except Exception as exc:
            raise RuntimeError(
                "Monitoring: NVIDIA backend could not be set up."
                " (pynvml could not be initialized)"
            ) from exc

        self._man_lib: ModuleType = nv
        self._man_type: Literal["nvidia", "amd"] = "nvidia"
        py_logger.info("Monitoring: NVIDIA backend set up")

    @property
    def man_lib(self) -> ModuleType | None:
        """The imported ``pynvml`` module."""
        return self._man_lib

    @property
    def man_type(self) -> Literal["nvidia", "amd"] | None:
        """Vendor tag of this backend; always ``"nvidia"``."""
        return self._man_type

    def get_handle_by_uuid(self, gpu_uuid: str) -> object:
        """Return the NVML device handle for the GPU with the given UUID."""
        return self._man_lib.nvmlDeviceGetHandleByUUID(str(gpu_uuid))

    def get_handle_by_id(self, gpu_id: int) -> object:
        """Return the NVML device handle for the GPU at index ``gpu_id``."""
        return self._man_lib.nvmlDeviceGetHandleByIndex(gpu_id)

    def get_gpu_utilization(self, handle) -> float:
        """Get the GPU utilization (%) for a given handle."""
        return float(self._man_lib.nvmlDeviceGetUtilizationRates(handle).gpu)

    def get_gpu_power_usage(self, handle) -> float:
        """Get the GPU power usage (W) for a given handle."""
        return float(self._man_lib.nvmlDeviceGetPowerUsage(handle) / 1000.0)  # mW -> W

    def get_visible_gpu_ids(self) -> list[int]:
        """Return the GPU indices listed in ``CUDA_VISIBLE_DEVICES``.

        Blank entries (for example from a trailing comma) are ignored.

        Returns:
            The listed indices in order, or an empty list when the variable
            is unset or empty.

        Raises:
            ValueError: If an entry is not an integer index.
        """
        raw_value = os.environ.get("CUDA_VISIBLE_DEVICES", "")
        entries = [entry.strip() for entry in raw_value.split(",")]
        try:
            return [int(entry) for entry in entries if entry]
        except ValueError as exc:
            raise ValueError(
                "CUDA_VISIBLE_DEVICES must contain comma-separated integer GPU "
                f"indices, got {raw_value!r}"
            ) from exc
class AMDBackend(GPUBackend):
    """GPU monitoring backend that talks to AMD devices through ``amdsmi``."""

    def __init__(self):
        """Import ``amdsmi``, initialise it and cache the processor handles.

        Raises:
            RuntimeError: If ``amdsmi`` cannot be imported or initialised, or
                if the processor handles cannot be retrieved. The underlying
                error is attached as ``__cause__``.
        """
        try:
            import amdsmi

            amdsmi.amdsmi_init()  # will raise if no AMD driver
            devices = amdsmi.amdsmi_get_processor_handles()
        except Exception as exc:
            raise RuntimeError(
                "Monitoring: AMD backend could not be set up."
                " (amdsmi could not be initialized)"
            ) from exc

        self._man_lib: ModuleType = amdsmi
        self._man_type: Literal["nvidia", "amd"] = "amd"
        self._devices = devices
        # Log only once initialisation and device enumeration both succeeded.
        py_logger.info("Monitoring: AMD backend set up")

    @property
    def man_lib(self) -> ModuleType | None:
        """The imported ``amdsmi`` module."""
        return self._man_lib

    @property
    def man_type(self) -> Literal["nvidia", "amd"] | None:
        """Vendor tag of this backend; always ``"amd"``."""
        return self._man_type

    def get_handle_by_uuid(self, gpu_uuid: str) -> object:
        """Return the cached processor handle whose UUID equals ``gpu_uuid``.

        Raises:
            ValueError: If none of the cached handles reports that UUID.
        """
        for device in self._devices:
            if self._man_lib.amdsmi_get_gpu_device_uuid(device) == gpu_uuid:
                return device
        raise ValueError(f"GPU with UUID {gpu_uuid} not accessible.")

    def get_handle_by_id(self, gpu_id: int) -> object:
        """Return the cached processor handle at index ``gpu_id``.

        Raises:
            ValueError: If ``gpu_id`` is negative or beyond the last handle.
        """
        # Reject negative values explicitly: plain sequence indexing would
        # silently wrap around and return a handle counted from the end.
        if not 0 <= gpu_id < len(self._devices):
            raise ValueError(f"GPU with ID {gpu_id} not accessible.")
        return self._devices[gpu_id]

    def get_gpu_utilization(self, handle) -> float:
        """Get the GPU utilization (%) for a given handle."""
        return float(self._man_lib.amdsmi_get_gpu_activity(handle)["gfx_activity"])

    def get_gpu_power_usage(self, handle) -> float:
        """Get the GPU power usage (W) for a given handle."""
        # NOTE(review): assumes "average_socket_power" is always numeric;
        # confirm against the amdsmi version in use.
        power_info = self._man_lib.amdsmi_get_power_info(handle)
        return float(power_info["average_socket_power"])  # W

    def get_visible_gpu_ids(self) -> list[int]:
        """Return the GPU indices listed in the AMD visibility variables.

        ``HIP_VISIBLE_DEVICES`` is used whenever it is set (even if empty);
        otherwise ``ROCR_VISIBLE_DEVICES`` is consulted. Blank entries (for
        example from a trailing comma) are ignored.

        Returns:
            The listed indices in order, or an empty list when the selected
            variable is unset or empty.

        Raises:
            ValueError: If an entry is not an integer index.
        """
        variable = "HIP_VISIBLE_DEVICES"
        raw_value = os.environ.get(variable)
        if raw_value is None:
            variable = "ROCR_VISIBLE_DEVICES"
            raw_value = os.environ.get(variable, "")

        entries = [entry.strip() for entry in raw_value.split(",")]
        try:
            return [int(entry) for entry in entries if entry]
        except ValueError as exc:
            raise ValueError(
                f"{variable} must contain comma-separated integer GPU "
                f"indices, got {raw_value!r}"
            ) from exc
def detect_gpu_backend() -> GPUBackend:
    """Detect the available GPU backend and return an instance of it.

    The NVIDIA backend is tried first, then the AMD backend. Each failed
    attempt is logged as a warning.

    Returns:
        The first backend that could be constructed.

    Raises:
        RuntimeError: If neither backend could be set up. The error of the
            last attempted backend is attached as ``__cause__``.
    """
    candidates = (("NVIDIA", NvidiaBackend), ("AMD", AMDBackend))
    last_error: RuntimeError | None = None

    for vendor, backend_cls in candidates:
        try:
            return backend_cls()
        except RuntimeError as exc:
            py_logger.warning("%s backend not available: %s", vendor, exc)
            # Keep a reference: the ``as`` name is cleared after the handler.
            last_error = exc

    raise RuntimeError(
        "No compatible GPU backend found. Please install pynvml or amdsmi."
    ) from last_error