# ServerGuard/modules/gpu.py

"""
ServerGuard - 显卡检测模块

检测 GPU 信息、温度、驱动状态等。
"""

import os
import re
from typing import Dict, Any, List, Optional

import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import (
    execute_command, check_command_exists, parse_key_value_output,
    safe_int, safe_float, format_bytes
)


def run_gpu_check() -> Dict[str, Any]:
    """
    Run every GPU probe and merge the findings into a single report.

    The vendor-specific probes (NVIDIA, AMD, Intel) run first; the generic
    ``lspci`` scan is only used when none of them reported a device. The
    kernel log is then scanned for GPU-related error lines.

    Returns:
        Dict[str, Any]: Report with the following keys:

        - ``status``: ``"success"``, ``"warning"`` (kernel-log findings),
          ``"unknown"`` (no GPU detected) or ``"error"`` (a probe raised).
        - ``gpus``: list of per-GPU dictionaries.
        - ``errors``: messages of exceptions raised while probing.
        - ``dmesg_errors``: kernel-log findings; absent when probing was
          aborted before the kernel log was scanned.
        - ``note``: only present when no GPU was detected.
        - ``error``: message of the exception that aborted probing; kept
          alongside ``errors`` for callers that read the singular key.
    """
    result: Dict[str, Any] = {
        "status": "success",
        "gpus": [],
        "errors": []
    }

    try:
        # Vendor-specific probes, in the order their results are reported.
        for probe in (check_nvidia_gpus, check_amd_gpus, check_intel_gpus):
            found = probe()
            if found:
                result["gpus"].extend(found)

        # Fall back to a plain lspci scan when no vendor probe found anything.
        if not result["gpus"]:
            result["gpus"] = check_generic_gpus()

        # GPU-related errors in the kernel log downgrade the status.
        result["dmesg_errors"] = check_gpu_dmesg_errors()
        if result["dmesg_errors"]:
            result["status"] = "warning"

        # Checked last, so "unknown" overrides "warning" when no GPU exists.
        if not result["gpus"]:
            result["status"] = "unknown"
            result["note"] = "未检测到 GPU 设备"

    except Exception as e:
        message = str(e)
        result["status"] = "error"
        result["error"] = message
        # Also fill the list declared above, which was previously never used.
        result["errors"].append(message)

    return result


def check_nvidia_gpus() -> List[Dict[str, Any]]:
    """
    Enumerate NVIDIA GPUs by querying ``nvidia-smi``.

    Returns:
        List[Dict[str, Any]]: One dictionary per CSV row that has at least
        four fields, holding the static fields of that row merged with the
        output of ``get_nvidia_gpu_details``. Empty when ``nvidia-smi`` is
        not available. Rows collected before an exception are still returned.
    """
    if not check_command_exists('nvidia-smi'):
        return []

    found = []
    query = [
        'nvidia-smi',
        '--query-gpu=gpu_name,gpu_bus_id,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current',
        '--format=csv,noheader',
    ]

    try:
        _, csv_text, _ = execute_command(query, check_returncode=False, timeout=10)

        for position, row in enumerate(csv_text.strip().split('\n')):
            if not row.strip():
                continue

            fields = [field.strip() for field in row.split(',')]
            if len(fields) < 4:
                continue

            # Pad short rows so the optional trailing columns read "unknown".
            padded = fields + ["unknown"] * (7 - len(fields))
            record = {
                "vendor": "NVIDIA",
                "index": position,
                "name": padded[0],
                "bus_id": padded[1],
                "pci_bus_id": padded[2],
                "driver_version": padded[3],
                "pstate": padded[4],
                "pcie_max_gen": padded[5],
                "pcie_current_gen": padded[6]
            }

            # Merge per-GPU telemetry; the row position is used as GPU index.
            record.update(get_nvidia_gpu_details(position))
            found.append(record)

    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass

    return found


def _nvidia_value_missing(value: str) -> bool:
    """Return True when ``value`` is a bracketed nvidia-smi placeholder."""
    # nvidia-smi reports unavailable fields as bracketed text, for example
    # "[Not Supported]" or "[N/A]"; treat every such value as missing.
    return value.startswith('[') and value.endswith(']')


def get_nvidia_gpu_details(gpu_index: int) -> Dict[str, Any]:
    """
    Collect telemetry, ECC state and running processes for one NVIDIA GPU.

    The three ``nvidia-smi`` queries are independent: a failure in one of
    them does not prevent the others from contributing their keys.

    Args:
        gpu_index: Index passed to ``nvidia-smi -i``.

    Returns:
        Dict[str, Any]: Any subset of the following keys, depending on which
        queries succeeded:

        - ``temperature_c``, ``power_draw_w``, ``power_limit_w``,
          ``graphics_clock_mhz``, ``memory_clock_mhz``,
          ``gpu_utilization_percent``, ``memory_utilization_percent``,
          ``memory_total_mb``, ``memory_used_mb``, ``memory_free_mb``:
          numeric values, or ``None`` when the field is unavailable.
        - ``serial``, ``uuid``, ``vbios_version``: strings or ``None``.
        - ``ecc_mode``, ``ecc_pending``: strings or ``None``.
        - ``ecc_corrected_errors``, ``ecc_uncorrected_errors``: counters,
          ``0`` when unavailable.
        - ``processes``: list of ``pid`` / ``type`` / ``sm_util`` /
          ``mem_util`` dictionaries; omitted when no process is listed.
    """
    details: Dict[str, Any] = {}
    gpu_arg = str(gpu_index)

    def parsed(raw, convert=None, missing=None):
        """Convert ``raw`` unless it is an nvidia-smi placeholder."""
        if _nvidia_value_missing(raw):
            return missing
        return convert(raw) if convert is not None else raw

    # Temperature, power, clocks, utilisation, memory and identity.
    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', '--query-gpu=temperature.gpu,power.draw,power.limit,clocks.gr,clocks.mem,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,serial,uuid,vbios_version',
             '--format=csv,noheader,nounits', '-i', gpu_arg],
            check_returncode=False, timeout=10
        )

        parts = [p.strip() for p in stdout.split(',')]
        if len(parts) >= 10:
            # (result key, converter) in the order of the queried columns.
            numeric_columns = (
                ("temperature_c", safe_int),
                ("power_draw_w", safe_float),
                ("power_limit_w", safe_float),
                ("graphics_clock_mhz", safe_int),
                ("memory_clock_mhz", safe_int),
                ("gpu_utilization_percent", safe_int),
                ("memory_utilization_percent", safe_int),
                ("memory_total_mb", safe_int),
                ("memory_used_mb", safe_int),
                ("memory_free_mb", safe_int),
            )
            for (key, convert), raw in zip(numeric_columns, parts):
                details[key] = parsed(raw, convert)

            # Identity columns are optional; zip() stops at the shorter side.
            for key, raw in zip(("serial", "uuid", "vbios_version"), parts[10:]):
                details[key] = parsed(raw)
    except Exception:
        # Best effort: leave the telemetry keys out.
        pass

    # ECC mode and volatile error counters.
    try:
        _, ecc_output, _ = execute_command(
            ['nvidia-smi', '--query-gpu=ecc.mode.current,ecc.mode.pending,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total',
             '--format=csv,noheader', '-i', gpu_arg],
            check_returncode=False, timeout=10
        )

        ecc_parts = [p.strip() for p in ecc_output.split(',')]
        if len(ecc_parts) >= 4:
            details["ecc_mode"] = parsed(ecc_parts[0])
            details["ecc_pending"] = parsed(ecc_parts[1])
            details["ecc_corrected_errors"] = parsed(ecc_parts[2], safe_int, missing=0)
            details["ecc_uncorrected_errors"] = parsed(ecc_parts[3], safe_int, missing=0)
    except Exception:
        # Best effort: leave the ECC keys out.
        pass

    # Processes currently using the GPU.
    try:
        _, proc_output, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1', '-i', gpu_arg],
            check_returncode=False, timeout=5
        )

        processes = []
        for line in proc_output.split('\n')[2:]:  # skip the two header lines
            if not line.strip() or line.startswith('#'):
                continue
            proc_parts = line.split()
            # A non-numeric pid is the "-" placeholder row printed for an
            # idle GPU (or unrelated text), not a real process.
            if len(proc_parts) >= 5 and proc_parts[1].isdigit():
                processes.append({
                    "pid": proc_parts[1],
                    "type": proc_parts[2],
                    "sm_util": proc_parts[3],
                    "mem_util": proc_parts[4]
                })

        if processes:
            details["processes"] = processes
    except Exception:
        # Best effort: leave the process list out.
        pass

    return details


def check_amd_gpus() -> List[Dict[str, Any]]:
    """
    Detect AMD GPUs via ``radeontop`` (when installed) and sysfs.

    Two independent, best-effort sources are consulted:

    1. ``radeontop`` dumps one sample to a temporary file; ``key: value``
       lines mentioning ``GPU`` are collected into one entry. The entry is
       only reported when at least one such line was parsed.
    2. Every ``/sys/class/drm/card*`` node whose PCI vendor ID is ``0x1002``
       yields one entry with the card name and, when readable, device ID,
       bound driver, temperature and core clock levels.

    Returns:
        List[Dict[str, Any]]: Detected AMD GPU entries, each with
        ``"vendor": "AMD"``; empty when nothing was found.
    """
    gpus: List[Dict[str, Any]] = []

    def read_text(path: str) -> Optional[str]:
        """Return the stripped content of ``path``, or None if unreadable."""
        try:
            with open(path, 'r') as f:
                return f.read().strip()
        except (OSError, ValueError):
            return None

    # Source 1: radeontop dump.
    if check_command_exists('radeontop'):
        try:
            import tempfile

            # Only a unique file name is needed; radeontop writes the file.
            with tempfile.NamedTemporaryFile(mode='r', suffix='.txt', delete=False) as f:
                dump_file = f.name

            try:
                execute_command(
                    ['radeontop', '-d', dump_file, '-l', '1'],
                    check_returncode=False, timeout=5
                )

                with open(dump_file, 'r') as f:
                    output = f.read()

                gpu_info = {"vendor": "AMD"}

                # NOTE(review): radeontop dump lines appear to look like
                # "<timestamp>: gpu 4.17%, ee 0.00%, ..." - confirm that this
                # "key: value" parser matches the installed version.
                for line in output.split('\n'):
                    if 'GPU' in line and ':' in line:
                        parts = line.split(':')
                        if len(parts) == 2:
                            key = parts[0].strip().lower().replace(' ', '_')
                            gpu_info[key] = parts[1].strip()

                # Report the entry only when something beyond the vendor tag
                # was parsed; otherwise a phantom AMD GPU would be listed on
                # every host that merely has radeontop installed.
                if len(gpu_info) > 1:
                    gpus.append(gpu_info)

            finally:
                if os.path.exists(dump_file):
                    os.unlink(dump_file)

        except Exception:
            # Best effort: fall through to the sysfs scan.
            pass

    # Source 2: sysfs.
    try:
        cards = sorted(os.listdir('/sys/class/drm'))
    except OSError:
        cards = []

    for card in cards:
        if not card.startswith('card'):
            continue

        device_dir = f'/sys/class/drm/{card}/device'

        # The AMD PCI vendor ID is 0x1002; nodes without a readable vendor
        # file are skipped.
        if read_text(f'{device_dir}/vendor') != '0x1002':
            continue

        gpu_info = {
            "vendor": "AMD",
            "card": card
        }

        device_id = read_text(f'{device_dir}/device')
        if device_id is not None:
            gpu_info["device_id"] = device_id

        # The driver entry is a symlink whose target name is the driver.
        try:
            gpu_info["driver"] = os.path.basename(os.readlink(f'{device_dir}/driver'))
        except OSError:
            pass

        # hwmon directories are numbered system-wide, so this GPU's node is
        # not necessarily "hwmon0"; use the first one exposing temp1_input.
        hwmon_dir = f'{device_dir}/hwmon'
        try:
            hwmon_nodes = sorted(os.listdir(hwmon_dir))
        except OSError:
            hwmon_nodes = []
        for node in hwmon_nodes:
            raw_temp = read_text(f'{hwmon_dir}/{node}/temp1_input')
            if raw_temp is None:
                continue
            temp_mc = safe_int(raw_temp)
            if isinstance(temp_mc, (int, float)):
                # The raw value is divided by 1000 to obtain degrees Celsius.
                gpu_info["temperature_c"] = temp_mc / 1000.0
            break

        clock_levels = read_text(f'{device_dir}/pp_dpm_sclk')
        if clock_levels is not None:
            gpu_info["core_clock_levels"] = clock_levels

        gpus.append(gpu_info)

    return gpus


def check_intel_gpus() -> List[Dict[str, Any]]:
    """
    Detect Intel GPUs from sysfs.

    Every ``/sys/class/drm/card*`` node whose PCI vendor ID is ``0x8086``
    yields one entry. Nodes are visited in sorted order and an unreadable
    node is skipped without affecting the others.

    Returns:
        List[Dict[str, Any]]: One dictionary per Intel GPU with ``vendor``,
        ``card`` and ``type`` (always ``"integrated"``), plus ``device_id``
        and ``driver`` when they can be read. Empty when nothing was found
        or ``/sys/class/drm`` cannot be listed.
    """
    gpus: List[Dict[str, Any]] = []

    def read_text(path: str) -> Optional[str]:
        """Return the stripped content of ``path``, or None if unreadable."""
        try:
            with open(path, 'r') as f:
                return f.read().strip()
        except (OSError, ValueError):
            return None

    try:
        cards = sorted(os.listdir('/sys/class/drm'))
    except OSError:
        return gpus

    for card in cards:
        if not card.startswith('card'):
            continue

        device_dir = f'/sys/class/drm/{card}/device'

        # The Intel PCI vendor ID is 0x8086; nodes without a readable vendor
        # file are skipped.
        if read_text(f'{device_dir}/vendor') != '0x8086':
            continue

        gpu_info = {
            "vendor": "Intel",
            "card": card
        }

        device_id = read_text(f'{device_dir}/device')
        if device_id is not None:
            gpu_info["device_id"] = device_id

        # The driver entry is a symlink whose target name is the driver.
        try:
            gpu_info["driver"] = os.path.basename(os.readlink(f'{device_dir}/driver'))
        except OSError:
            pass

        # Every Intel GPU is labelled as integrated.
        # NOTE(review): a discrete Intel card would be mislabelled - confirm
        # whether that matters for consumers of this field.
        gpu_info["type"] = "integrated"

        gpus.append(gpu_info)

    return gpus


def check_generic_gpus() -> List[Dict[str, Any]]:
    """
    Detect display adapters with ``lspci`` when no vendor probe succeeded.

    Lines of ``lspci -nn`` mentioning ``VGA``, ``3D controller`` or
    ``Display controller`` are reported. The vendor is taken from the
    ``[vendor:device]`` PCI ID when it is a known one, otherwise from a
    whole-word match on the description. A plain substring test is not used
    because "ati" also occurs inside "Corporation".

    Returns:
        List[Dict[str, Any]]: One dictionary per adapter with ``bus_id``,
        ``description``, ``vendor`` (``"NVIDIA"``, ``"AMD"``, ``"Intel"`` or
        ``"Unknown"``) and ``type`` (``"vga"``, ``"3d"`` or ``"display"``),
        plus ``driver`` and ``modules`` when ``lspci -v`` reports them.
        Empty when ``lspci`` is unavailable or fails.
    """
    gpus: List[Dict[str, Any]] = []

    if not check_command_exists('lspci'):
        return gpus

    # Known PCI vendor IDs (lower-case hex, as printed by lspci -nn).
    pci_vendors = {'10de': "NVIDIA", '1002': "AMD", '8086': "Intel"}
    pci_id_re = re.compile(r'\[([0-9a-fA-F]{4}):[0-9a-fA-F]{4}\]')

    # Fallback for unknown IDs: whole-word vendor names only.
    name_patterns = (
        ("NVIDIA", re.compile(r'\bnvidia\b', re.IGNORECASE)),
        ("AMD", re.compile(r'\b(?:amd|ati)\b', re.IGNORECASE)),
        ("Intel", re.compile(r'\bintel\b', re.IGNORECASE)),
    )

    # Device-class markers in priority order, mapped to the reported type.
    class_types = (
        ('VGA', "vga"),
        ('3D controller', "3d"),
        ('Display controller', "display"),
    )

    try:
        _, stdout, _ = execute_command(
            ['lspci', '-nn'],
            check_returncode=False, timeout=10
        )

        for line in stdout.split('\n'):
            gpu_type = next(
                (label for marker, label in class_types if marker in line),
                None
            )
            if gpu_type is None:
                continue

            parts = line.split(': ', 1)
            if len(parts) != 2:
                continue

            slot_tokens = parts[0].split()
            if not slot_tokens:
                # Malformed line without a bus address.
                continue

            bus_id = slot_tokens[0]
            description = parts[1]

            gpu_info = {
                "bus_id": bus_id,
                "description": description
            }

            # Identify the vendor: PCI ID first, whole-word name second.
            vendor = "Unknown"
            id_match = pci_id_re.search(description)
            if id_match and id_match.group(1).lower() in pci_vendors:
                vendor = pci_vendors[id_match.group(1).lower()]
            else:
                for vendor_name, pattern in name_patterns:
                    if pattern.search(description):
                        vendor = vendor_name
                        break
            gpu_info["vendor"] = vendor
            gpu_info["type"] = gpu_type

            # Driver and module details are optional extras.
            try:
                _, detail, _ = execute_command(
                    ['lspci', '-v', '-s', bus_id],
                    check_returncode=False, timeout=5
                )

                driver_match = re.search(r'Kernel driver in use:\s*(\S+)', detail)
                if driver_match:
                    gpu_info["driver"] = driver_match.group(1)

                modules_match = re.search(r'Kernel modules:\s*(.+)', detail)
                if modules_match:
                    gpu_info["modules"] = modules_match.group(1).strip()

            except Exception:
                # Best effort: report the adapter without driver details.
                pass

            gpus.append(gpu_info)

    except Exception:
        # Best effort: return whatever was collected before the failure.
        pass

    return gpus


def check_gpu_dmesg_errors() -> List[Dict[str, str]]:
    """
    Scan the kernel log (``dmesg``) for GPU-related problems.

    Two groups of patterns are used:

    - Unambiguous failure signatures (Xid events, GPU hangs, lost VRAM,
      firmware load failures) are always reported.
    - Broader patterns (vendor name plus "error", ring timeouts, GPU faults,
      thermal messages) are only reported when the line also contains
      ``error``, ``fail``, ``warn`` or ``Xid``.

    Repeated messages that differ only in their timestamp are reported once
    (first occurrence), and at most 20 entries are returned.

    Returns:
        List[Dict[str, str]]: Entries with ``timestamp`` (the leading
        ``[...]`` value, or ``"unknown"``) and ``message`` (the stripped
        log line). Empty when ``dmesg`` is unavailable or fails.
    """
    errors: List[Dict[str, str]] = []

    if not check_command_exists('dmesg'):
        return errors

    # Signatures that describe a GPU failure on their own.
    always_report = re.compile(
        r'GPU has fallen off the bus'
        r'|NVRM: Xid'
        r'|GPU hang'
        r'|VRAM (?:is )?lost'
        r'|Failed to load firmware',
        re.IGNORECASE
    )
    # Broader patterns that also match harmless lines (for example "fault"
    # inside "default"), so they additionally need a severity keyword.
    needs_severity = re.compile(
        r'nvidia.*error'
        r'|amdgpu.*error'
        r'|i915.*error'
        r'|ring.*timeout'
        r'|gpu.*fault'
        r'| thermal ',
        re.IGNORECASE
    )
    severity = re.compile(r'error|fail|warn', re.IGNORECASE)
    timestamp_re = re.compile(r'\[\s*([\d.]+)\]')
    max_errors = 20

    try:
        _, stdout, _ = execute_command(
            ['dmesg'],
            check_returncode=False, timeout=10
        )

        seen = set()
        for line in stdout.split('\n'):
            if not always_report.search(line):
                if not needs_severity.search(line):
                    continue
                if not (severity.search(line) or 'Xid' in line):
                    continue

            timestamp_match = timestamp_re.match(line)
            if timestamp_match:
                timestamp = timestamp_match.group(1)
                # De-duplicate on the text after the timestamp so that a
                # recurring error is listed only once.
                dedup_key = line[timestamp_match.end():].strip()
            else:
                timestamp = "unknown"
                dedup_key = line.strip()

            if dedup_key in seen:
                continue
            seen.add(dedup_key)

            errors.append({
                "timestamp": timestamp,
                "message": line.strip()
            })

            if len(errors) >= max_errors:
                break

        return errors

    except Exception:
        # Best effort: an unreadable kernel log yields no findings.
        return []


def get_gpu_processes() -> List[Dict[str, Any]]:
    """
    List the processes currently using a GPU (NVIDIA only).

    Runs ``nvidia-smi pmon -s um -c 1`` and maps each data row to the column
    names printed in the first ``#`` header line. The command name is taken
    from the last column, because ``-s um`` puts memory columns between the
    utilisation columns and the command. Rows without a numeric pid (the
    ``-`` placeholder of an idle GPU, or unrelated text) are ignored.

    Returns:
        List[Dict[str, Any]]: One dictionary per process with ``gpu_index``,
        ``pid``, ``type``, ``sm_util``, ``mem_util``, ``enc_util``,
        ``dec_util``, ``fb_mem_mb`` and ``command``. Columns missing from
        the output are reported as ``"-"``. Empty when ``nvidia-smi`` is
        unavailable or fails.
    """
    processes: List[Dict[str, Any]] = []

    if not check_command_exists('nvidia-smi'):
        return processes

    # Column order assumed when the header line cannot be used.
    default_columns = ['gpu', 'pid', 'type', 'sm', 'mem', 'enc', 'dec']

    try:
        _, stdout, _ = execute_command(
            ['nvidia-smi', 'pmon', '-s', 'um', '-c', '1'],
            check_returncode=False, timeout=5
        )

        lines = stdout.strip().split('\n')

        # The first '#' line names the columns; the second holds the units.
        header = next((ln for ln in lines if ln.startswith('#')), '')
        columns = header.lstrip('#').split()
        header_usable = (
            len(columns) >= 2
            and columns[-1] == 'command'
            and 'pid' in columns
        )

        for line in lines:
            if not line.strip() or line.startswith('#'):
                continue

            parts = line.split()
            if len(parts) < 8:
                continue

            if header_usable and len(parts) >= len(columns):
                fixed = len(columns) - 1
                row = dict(zip(columns[:fixed], parts[:fixed]))
                # Everything after the fixed columns is the command name.
                row['command'] = ' '.join(parts[fixed:])
            else:
                row = dict(zip(default_columns, parts))
                row['command'] = parts[-1]

            pid = row.get('pid', '')
            if not pid.isdigit():
                # Placeholder row or non-table text, not a real process.
                continue

            processes.append({
                "gpu_index": safe_int(row.get('gpu', '-')),
                "pid": pid,
                "type": row.get('type', '-'),
                "sm_util": row.get('sm', '-'),
                "mem_util": row.get('mem', '-'),
                "enc_util": row.get('enc', '-'),
                "dec_util": row.get('dec', '-'),
                # NOTE(review): the pmon "fb" column is presumably in MB -
                # confirm against the installed nvidia-smi version.
                "fb_mem_mb": row.get('fb', '-'),
                "command": row['command']
            })
    except Exception:
        # Best effort: return whatever was collected before the failure.
        pass

    return processes


if __name__ == '__main__':
    # Running the module directly prints the full GPU report as JSON,
    # keeping non-ASCII text readable.
    import json

    report = run_gpu_check()
    print(json.dumps(report, indent=2, ensure_ascii=False))